Data Science Programming
Mei 23, 2025
Bab 1 VISUALISASI DESKRIPTIF
## C:\Users\HP\AppData\Local\Programs\Python\PYTHON~1\python.exe
# DT provides datatable(), an interactive HTML table widget.
library(DT)

# Read the dummy dataset; the path is relative to the working directory.
data <- read.csv(file = "data/data-dummy.csv")

# Show the dataset as a paginated table, ten rows per page.
table_options <- list(pageLength = 10)
datatable(data, options = table_options)
1.1 Numerical Data
1.1.1 Python
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Load the dataset.
data = pd.read_csv("data/data-dummy.csv")

# Keep the non-null 'Quantity' values (the column must exist in the CSV).
quantity = data['Quantity'].dropna()

# Summary statistics used by the plot and by the annotations that follow.
Q1 = quantity.quantile(0.25)
Q3 = quantity.quantile(0.75)
IQR = Q3 - Q1
mean = quantity.mean()
min_val = quantity.min()
max_val = quantity.max()

# Values more than 1.5 * IQR outside the quartiles are treated as outliers.
is_outlier = (quantity < Q1 - 1.5 * IQR) | (quantity > Q3 + 1.5 * IQR)
outliers = quantity[is_outlier]
inliers = quantity[~is_outlier]

# Plot
plt.figure(figsize=(10, 6))
# fliersize=0 shrinks the boxplot's own outlier markers to nothing; the strip
# plots below draw the individual observations instead.
sns.boxplot(y=quantity, color="skyblue", width=0.4, fliersize=0)
# Draw every observation exactly once: regular points in black, outliers in
# red. Plotting all points in black and then the outliers again in red would
# show each outlier twice, because each strip plot applies its own jitter.
sns.stripplot(y=inliers, color='black', size=3, alpha=0.5, jitter=True)
if not outliers.empty:
    sns.stripplot(y=outliers, color='red', size=5, jitter=True)

# Horizontal dashed reference line at the mean.
plt.axhline(mean, color='blue', linestyle='--', linewidth=1)
# Annotation helper
def annotate_text(y, text, color, bold=False, italic=False, x=1.2):
    """Write a left-aligned text label on the current matplotlib axes.

    Parameters
    ----------
    y : float
        Vertical position of the label.
    text : str
        Text to display.
    color : str
        Colour passed to ``plt.text``.
    bold : bool, default False
        Use a bold font weight when True.
    italic : bool, default False
        Use an italic font style when True.
    x : float, default 1.2
        Horizontal position of the label. The default keeps the position
        that was previously hard-coded.
    """
    plt.text(
        x, y, text,
        fontsize=10,
        color=color,
        ha='left',
        fontstyle='italic' if italic else 'normal',
        fontweight='bold' if bold else 'normal'
    )
# Label the quartiles, the mean and the extremes.
annotate_text(Q1, f"Q1: {Q1:.1f}", 'green')
annotate_text(Q3, f"Q3: {Q3:.1f}", 'green')
annotate_text(mean, f"Mean: {mean:.2f}", 'blue', bold=True)
annotate_text(min_val, f"Min: {min_val}", 'orange')
annotate_text(max_val, f"Max: {max_val}", 'orange')
# The outlier count is anchored at the largest outlier. With no outliers,
# outliers.max() is NaN and the label would get a non-finite position, so the
# label is only drawn when at least one outlier exists.
if not outliers.empty:
    annotate_text(outliers.max(), f"Outliers: {len(outliers)}", 'red', bold=True)
# Styling
plt.title("Boxplot of Quantity with Jitter", fontsize=16, weight='bold')
plt.ylabel("Quantity")
plt.xticks([]) # Hide the x-axis ticks: there is only one variable
## ([], [])
1.1.2 Violin Plot
# ==============================
# 1. Import Library
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# ==============================
# 2. Load and prepare the data
# ==============================
# Replace with the path to your own CSV file.
data_bisnis = pd.read_csv("data/data-dummy.csv")

# Force Quantity to a numeric dtype; values that cannot be parsed become NaN
# and those rows are dropped.
data_bisnis["Quantity"] = pd.to_numeric(data_bisnis["Quantity"], errors="coerce")
data_bisnis = data_bisnis.dropna(subset=["Quantity"])

# Summary statistics used by the annotations.
quantity_col = data_bisnis["Quantity"]
Q1, Q3 = quantity_col.quantile(0.25), quantity_col.quantile(0.75)
IQR = Q3 - Q1
lower_whisker = Q1 - 1.5 * IQR
upper_whisker = Q3 + 1.5 * IQR
mean_val = quantity_col.mean()
median_val = quantity_col.median()
min_val = quantity_col.min()
max_val = quantity_col.max()

# Flag the values that fall outside the 1.5 * IQR fences.
outlier_mask = (quantity_col < lower_whisker) | (quantity_col > upper_whisker)
outliers_count = outlier_mask.sum()
data_bisnis["is_outlier"] = outlier_mask.map({True: "Outlier", False: "Normal"})

# ==============================
# 3. Plot: violin + box + jitter + annotations
# ==============================
plt.figure(figsize=(16, 10))

# A constant 'dummy' column gives seaborn a single category on the x-axis.
plot_frame = data_bisnis.assign(dummy=1)
layer_args = dict(x="dummy", y="Quantity", data=plot_frame)

sns.violinplot(inner=None, color="skyblue", **layer_args)
sns.boxplot(
    width=0.2, showcaps=True, showfliers=False,
    boxprops={'facecolor': 'none'}, **layer_args
)
sns.stripplot(
    hue="is_outlier", palette={"Normal": "black", "Outlier": "red"},
    jitter=0.15, size=4, alpha=0.7, dodge=False, **layer_args
)

# Statistic labels: (y position, text, colour, font settings).
stat_labels = [
    (mean_val, f"Mean: {mean_val:.2f}", "blue", {"fontsize": 14, "fontweight": "bold"}),
    (Q1, f"Q1: {Q1:.2f}", "green", {"fontsize": 13}),
    (median_val, f"Median: {median_val:.2f}", "purple", {"fontsize": 13}),
    (Q3, f"Q3: {Q3:.2f}", "green", {"fontsize": 13}),
    (min_val, f"Min: {min_val:.2f}", "orange", {"fontsize": 13}),
    (max_val, f"Max: {max_val:.2f}", "orange", {"fontsize": 13}),
]
for label_y, label_text, label_color, font_kwargs in stat_labels:
    plt.text(1.1, label_y, label_text, color=label_color, **font_kwargs)
plt.text(1, max_val + 0.3, f"Outliers: {outliers_count}", color="red", fontsize=14, style="italic", ha="center")

# Title and general appearance
plt.title("Violin Plot of Quantity with Outlier Highlighted", fontsize=24, weight="bold")
plt.ylabel("Quantity", fontsize=18)
plt.xticks([], []) # hide the x-axis ticks: the axis only carries the dummy category
## ([], [])
## (array([-2., 0., 2., 4., 6., 8., 10., 12.]), [Text(0, -2.0, '−2'), Text(0, 0.0, '0'), Text(0, 2.0, '2'), Text(0, 4.0, '4'), Text(0, 6.0, '6'), Text(0, 8.0, '8'), Text(0, 10.0, '10'), Text(0, 12.0, '12')])
plt.legend(title="Point Type", title_fontsize=15, fontsize=13, loc="upper right")
sns.despine()
plt.tight_layout()
plt.show()
1.2 Combo
1.2.1 Grouped Bar Chart
# ==============================
# 1. Import Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# ==============================
# 2. Read the data
# ==============================
data_bisnis = pd.read_csv("data/data-dummy.csv")

# ==============================
# 3. Summarise: total sales per product category and region
# ==============================
group_keys = ['Product_Category', 'Region']
sales_summary = (
    data_bisnis
    .groupby(group_keys, as_index=False)['Total_Price']
    .sum()
    .rename(columns={'Total_Price': 'Total_Sales'})
)

# ==============================
# 4. Grouped bar chart: one bar per region inside each category
# ==============================
plt.figure(figsize=(12, 6))
bar_ax = sns.barplot(
    x="Product_Category",
    y="Total_Sales",
    hue="Region",
    data=sales_summary,
    dodge=True
)
bar_ax.set_title("Total Sales by Product Category and Region", fontsize=16, fontweight="bold")
bar_ax.set_xlabel("Product Category", fontsize=14)
bar_ax.set_ylabel("Total Sales (USD)", fontsize=14)
plt.xticks(rotation=45, ha='right')
## ([0, 1, 2, 3, 4], [Text(0, 0, 'Books'), Text(1, 0, 'Clothing'), Text(2, 0, 'Electronics'), Text(3, 0, 'Groceries'), Text(4, 0, 'Home')])
1.2.2 Ridgeline Plot
# ==============================
# 1. Load Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
from joypy import joyplot
import seaborn as sns
import matplotlib.ticker as ticker
# ==============================
# 2. Filter valid data
# ==============================
data_bisnis = pd.read_csv("data/data-dummy.csv")

# Keep only rows whose Price_per_Unit is a finite number (not NaN, Inf, -Inf).
# The column is coerced to numeric once and the coerced values are the ones
# kept, so text columns holding numbers are handled as well. Only pandas is
# used here because this chunk does not import numpy.
price = pd.to_numeric(data_bisnis['Price_per_Unit'], errors='coerce')
price = price.replace([float("inf"), float("-inf")], float("nan"))
has_finite_price = price.notna()
data_bisnis_filtered = data_bisnis[has_finite_price].assign(
    Price_per_Unit=price[has_finite_price]
)

# ==============================
# 3. Create ridgeline plot
# ==============================
# joyplot() creates its own figure sized by `figsize`, so no separate
# plt.figure() call is made; an extra call would leave an empty figure behind.
fig, axes = joyplot(
    data_bisnis_filtered,
    by="Region",
    column="Price_per_Unit",
    figsize=(16, 10),
    fade=True,
    alpha=0.7,
    colormap=plt.cm.tab10,
    linewidth=1
)
plt.title("Distribution of Price per Unit by Region", fontsize=24, weight="bold")
plt.xlabel("Price per Unit", fontsize=18)
plt.ylabel("Region", fontsize=18)
# Rupiah format (e.g., Rp1.000)
def rupiah_format(x, pos):
    """Format a tick value as a Rupiah label with dots as thousands separators.

    Parameters
    ----------
    x : float
        Tick value; it is truncated to an integer before formatting.
    pos : int or None
        Tick position supplied by matplotlib's FuncFormatter; unused.

    Returns
    -------
    str
        For example ``"Rp1.000"`` for ``x == 1000``.
    """
    # Format with comma separators first, then swap the commas for dots.
    return f"Rp{int(x):,}".replace(",", ".")
# Route the x-axis tick labels through rupiah_format, then enlarge them.
rupiah_ticks = ticker.FuncFormatter(rupiah_format)
current_axes = plt.gca()
current_axes.xaxis.set_major_formatter(rupiah_ticks)
plt.xticks(fontsize=14)
## (array([-5., 0., 5., 10., 15., 20., 25., 30.]), [Text(-5.0, 0, 'Rp-5'), Text(0.0, 0, 'Rp0'), Text(5.0, 0, 'Rp5'), Text(10.0, 0, 'Rp10'), Text(15.0, 0, 'Rp15'), Text(20.0, 0, 'Rp20'), Text(25.0, 0, 'Rp25'), Text(30.0, 0, 'Rp30')])
## (array([0. , 0.2, 0.4, 0.6, 0.8, 1. ]), [Text(0, 0.0, '0.0'), Text(0, 0.2, '0.2'), Text(0, 0.4, '0.4'), Text(0, 0.6000000000000001, '0.6'), Text(0, 0.8, '0.8'), Text(0, 1.0, '1.0')])
1.2.3 Boxplot by Category
# ==============================
# 1. Import Required Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# ==============================
# 2. Prepare data
# ==============================
data_bisnis = pd.read_csv("data/data-dummy.csv")
# Make sure Quantity is numeric and drop the rows where it is NaN.
data_bisnis['Quantity'] = pd.to_numeric(data_bisnis['Quantity'], errors='coerce')
data_bisnis = data_bisnis.dropna(subset=['Quantity'])
# ==============================
# 3. Create boxplot
# ==============================
plt.figure(figsize=(18, 10)) # Large canvas so the big fonts stay readable
ax = sns.boxplot(
    data=data_bisnis,
    x="Product_Category",
    y="Quantity",
    # Colour by the same variable as the x-axis: newer seaborn versions
    # deprecate passing `palette` without `hue`. dodge=False keeps one
    # full-width box per category.
    hue="Product_Category",
    palette="Set2",
    dodge=False,
    showfliers=True, # Show outliers
    # Outlier markers: red circles of size 8. The former fliersize=6 argument
    # was dropped because this markersize already overrode it.
    flierprops=dict(marker='o', markerfacecolor='red', markersize=8)
)
# Some seaborn versions add a legend for `hue`; it would only repeat the
# x-axis labels, so remove it when present.
category_legend = ax.get_legend()
if category_legend is not None:
    category_legend.remove()
plt.title("Boxplot of Quantity by Product Category", fontsize=30, weight='bold')
plt.xlabel("Product Category", fontsize=25)
plt.ylabel("Quantity", fontsize=25)
plt.xticks(fontsize=20, rotation=0)
## ([0, 1, 2, 3, 4], [Text(0, 0, 'Clothing'), Text(1, 0, 'Electronics'), Text(2, 0, 'Home'), Text(3, 0, 'Groceries'), Text(4, 0, 'Books')])
## (array([-2., 0., 2., 4., 6., 8., 10.]), [Text(0, -2.0, '−2'), Text(0, 0.0, '0'), Text(0, 2.0, '2'), Text(0, 4.0, '4'), Text(0, 6.0, '6'), Text(0, 8.0, '8'), Text(0, 10.0, '10')])
1.2.4 Lollipop Chart
# ==============================
# 1. Import Required Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# ==============================
# 2. Prepare data
# ==============================
data_bisnis = pd.read_csv("data/data-dummy.csv")
# Total sales per Product_Category and Region.
sales_grouped = data_bisnis.groupby(['Product_Category', 'Region'], as_index=False)['Total_Price'].sum()
sales_grouped.rename(columns={'Total_Price': 'Total_Sales'}, inplace=True)
# ==============================
# 3. Grouped lollipop chart
# ==============================
plt.figure(figsize=(18, 10))
# One "Set1" colour per region.
colors = sns.color_palette("Set1", n_colors=sales_grouped['Region'].nunique())
# Order the categories from smallest to largest overall total
# (similar to reorder() in R).
category_order = sales_grouped.groupby("Product_Category")["Total_Sales"].sum().sort_values().index.tolist()
# Draw one set of lollipops per region.
for i, region in enumerate(sales_grouped['Region'].unique()):
    subset = sales_grouped[sales_grouped['Region'] == region]
    # Align the region's rows with the global category order. reindex() is
    # used instead of .loc[] so that a region with no sales in some category
    # gets a NaN row (nothing drawn) rather than raising KeyError.
    subset = (
        subset
        .set_index("Product_Category")
        .reindex(category_order)
        .rename_axis("Product_Category")
        .reset_index()
    )
    # Stem of the lollipop.
    plt.hlines(y=subset['Product_Category'], xmin=0, xmax=subset['Total_Sales'],
               color=colors[i], linewidth=5, label=region)
    # Head of the lollipop.
    plt.plot(subset['Total_Sales'], subset['Product_Category'], "o",
             color=colors[i], markersize=10)
# ==============================
# 4. Styling
# ==============================
plt.xlabel("Total Sales", fontsize=30)
plt.ylabel("Product Category", fontsize=30)
plt.title("Grouped Lollipop Chart", fontsize=20, weight='bold')
# The stems carry a region label; show the legend so the colours can be read.
plt.legend(title="Region", fontsize=16, title_fontsize=16)
plt.xticks(fontsize=20)
## (array([-200., 0., 200., 400., 600., 800., 1000., 1200., 1400.,
## 1600.]), [Text(-200.0, 0, '−200'), Text(0.0, 0, '0'), Text(200.0, 0, '200'), Text(400.0, 0, '400'), Text(600.0, 0, '600'), Text(800.0, 0, '800'), Text(1000.0, 0, '1000'), Text(1200.0, 0, '1200'), Text(1400.0, 0, '1400'), Text(1600.0, 0, '1600')])
## ([0, 1, 2, 3, 4], [Text(0, 0, 'Electronics'), Text(0, 1, 'Groceries'), Text(0, 2, 'Books'), Text(0, 3, 'Home'), Text(0, 4, 'Clothing')])
1.2.5 Heatmap
Python Code
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Read the dataset.
df = pd.read_csv("data/data-dummy.csv")

# Cross-tabulate total Quantity: product categories as rows, regions as
# columns, with 0 for combinations that never occur.
pivot_qty = pd.pivot_table(
    df,
    index="Product_Category",
    columns="Region",
    values="Quantity",
    aggfunc="sum",
    fill_value=0
)

# Draw the annotated heatmap.
heatmap_fig = plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(pivot_qty, annot=True, fmt=".0f", cmap="Blues", linewidths=0.5)
heatmap_ax.set_title("Heatmap: Quantity Sold by Product Category and Region", fontsize=16, weight="bold")
heatmap_ax.set_xlabel("Region")
heatmap_ax.set_ylabel("Product Category")
heatmap_fig.tight_layout()
plt.show()
R Code
library(ggplot2)
library(dplyr)
# Read the dataset, keeping text columns as character vectors.
data_bisnis <- read.csv("data/data-dummy.csv", stringsAsFactors = FALSE)

# Total Quantity for every product category / region pair (NA ignored).
heatmap_data <- summarise(
  group_by(data_bisnis, Product_Category, Region),
  Total_Quantity = sum(Quantity, na.rm = TRUE),
  .groups = "drop"
)

# Tile heatmap with the total printed inside each tile.
heatmap_labels <- labs(
  title = "Heatmap: Total Quantity by Product Category and Region",
  x = "Region",
  y = "Product Category",
  fill = "Quantity"
)
heatmap_plot <- ggplot(
  heatmap_data,
  aes(x = Region, y = Product_Category, fill = Total_Quantity)
) +
  geom_tile(color = "white") +
  geom_text(aes(label = Total_Quantity), color = "white", size = 5) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  heatmap_labels +
  theme_minimal(base_size = 16)
heatmap_plot
1.3 Relationship
1.3.1 Scatter Plot
Python Code
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Read the dataset.
data_bisnis = pd.read_csv("data/data-dummy.csv")

# Total sales and mean discount per Product_Category and Region.
scatter_data = (
    data_bisnis
    .groupby(['Product_Category', 'Region'], as_index=False)
    .agg(
        Total_Sales=('Total_Price', 'sum'),
        Avg_Discount=('Discount', 'mean')
    )
)

# Visual theme
sns.set(style="whitegrid")

# Plot: one point per category/region pair, coloured by region.
plt.figure(figsize=(12, 8))
scatter_ax = sns.scatterplot(
    x="Avg_Discount",
    y="Total_Sales",
    hue="Region",
    data=scatter_data,
    palette="deep",
    s=150,
    alpha=0.85,
    edgecolor="black"
)
scatter_ax.set_title("Scatter Plot: Average Discount vs Total Sales", fontsize=20, weight='bold')
scatter_ax.set_xlabel("Average Discount", fontsize=14)
scatter_ax.set_ylabel("Total Sales", fontsize=14)
plt.xticks(fontsize=12)
## (array([0.12 , 0.125, 0.13 , 0.135, 0.14 , 0.145, 0.15 , 0.155, 0.16 ,
## 0.165]), [Text(0.12, 0, '0.120'), Text(0.125, 0, '0.125'), Text(0.13, 0, '0.130'), Text(0.135, 0, '0.135'), Text(0.13999999999999999, 0, '0.140'), Text(0.145, 0, '0.145'), Text(0.15, 0, '0.150'), Text(0.155, 0, '0.155'), Text(0.16, 0, '0.160'), Text(0.16499999999999998, 0, '0.165')])
## (array([ 200., 400., 600., 800., 1000., 1200., 1400., 1600.]), [Text(0, 200.0, '200'), Text(0, 400.0, '400'), Text(0, 600.0, '600'), Text(0, 800.0, '800'), Text(0, 1000.0, '1000'), Text(0, 1200.0, '1200'), Text(0, 1400.0, '1400'), Text(0, 1600.0, '1600')])
R Code
# ==============================
# 1. Load Libraries
# ==============================
library(ggplot2)
library(dplyr)
# ==============================
# 2. Read and aggregate the data
# ==============================
data_bisnis <- read.csv("data/data-dummy.csv", stringsAsFactors = FALSE)

# One row per category/region pair: total sales and mean discount (NA ignored).
by_category_region <- group_by(data_bisnis, Product_Category, Region)
scatter_data <- summarise(
  by_category_region,
  Total_Sales = sum(Total_Price, na.rm = TRUE),
  Avg_Discount = mean(Discount, na.rm = TRUE),
  .groups = "drop"
)

# ==============================
# 3. Scatter plot
# ==============================
scatter_labels <- labs(
  title = "Scatter Plot: Average Discount vs Total Sales",
  x = "Average Discount",
  y = "Total Sales",
  color = "Region"
)

# Bold, centred title and slightly smaller legend text.
scatter_theme <- theme(
  plot.title = element_text(face = "bold", hjust = 0.5),
  legend.title = element_text(size = 14),
  legend.text = element_text(size = 12)
)

ggplot(
  scatter_data,
  aes(x = Avg_Discount, y = Total_Sales, color = Region)
) +
  geom_point(size = 5, alpha = 0.85) +
  scatter_labels +
  theme_minimal(base_size = 16) +
  scatter_theme
1.3.2 Bubble Chart
Python Code
import pandas as pd
import matplotlib.pyplot as plt
# Read the dataset.
data = pd.read_csv("data/data-dummy.csv")
# Keep only rows with a positive Quantity and a positive Total_Price
# (rows where either value is missing fail the comparison and are dropped too).
data_filtered = data[(data['Quantity'] > 0) & (data['Total_Price'] > 0)]
# Colour per Region.
colors = {'North': '#1b9e77', # Dark green
          'South': '#d95f02', # Orange
          'East': '#7570b3', # Purple
          'West': '#e7298a'} # Pink
# Draw the bubble chart, one scatter layer per region.
plt.figure(figsize=(10, 6))
# dropna() skips rows without a region instead of failing on a NaN key.
for region in data_filtered['Region'].dropna().unique():
    subset = data_filtered[data_filtered['Region'] == region]
    plt.scatter(subset['Quantity'],
                subset['Total_Price'],
                s=subset['Discount'] * 1000, # Scale the discount into a bubble area
                # Regions missing from the palette fall back to gray rather
                # than raising KeyError.
                color=colors.get(region, 'gray'),
                label=region,
                alpha=0.6)
# Title and axis labels.
plt.title('Bubble Chart: Kuantitas vs Total Harga (Ukuran: Diskon, Warna: Wilayah)',
          fontsize=14, pad=10)
plt.xlabel('Kuantitas', fontsize=12)
plt.ylabel('Total Harga', fontsize=12)
# Legend and grid.
plt.legend(title='Wilayah')
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
R Code
# Memuat library yang diperlukan
library(ggplot2)
library(dplyr)
# Read the dataset.
data <- read.csv("data/data-dummy.csv")

# Keep rows whose Quantity and Total_Price are both positive; filter() also
# drops rows where either comparison is NA.
data_filtered <- filter(data, Quantity > 0, Total_Price > 0)

# Fixed colour per region.
region_colours <- c(
  "North" = "#1b9e77", # Dark green
  "South" = "#d95f02", # Orange
  "East" = "#7570b3",  # Purple
  "West" = "#e7298a"   # Pink
)

bubble_labels <- labs(
  title = "Bubble Chart: Kuantitas vs Total Harga (Ukuran: Diskon, Warna: Wilayah)",
  x = "Kuantitas",
  y = "Total Harga",
  size = "Diskon",
  color = "Wilayah"
)

bubble_theme <- theme(
  plot.title = element_text(hjust = 0.5, size = 8, face = "bold"),
  axis.title = element_text(size = 12),
  axis.text = element_text(size = 10),
  legend.position = "right"
)

# Bubble chart: size encodes the discount, colour encodes the region.
ggplot(
  data_filtered,
  aes(x = Quantity, y = Total_Price, size = Discount, color = Region)
) +
  geom_point(alpha = 0.6) + # Transparency keeps overlapping bubbles readable
  scale_size_continuous(range = c(3, 15)) + # Range of bubble sizes
  scale_color_manual(values = region_colours) +
  bubble_labels +
  theme_minimal() +
  bubble_theme
1.3.3 Correlation Matrix
Python Code
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Read the dataset.
data = pd.read_csv("data/data-dummy.csv")

# Variables that enter the correlation matrix; rows with any missing value
# among them are dropped.
vars = ['Unit_Price', 'Delivery_Time', 'Efficiency', 'Feature_Interaction']
data_selected = data.loc[:, vars].dropna()

# Pearson and Spearman correlation matrices.
cor_pearson, cor_spearman = (
    data_selected.corr(method=method_name)
    for method_name in ('pearson', 'spearman')
)

# Heatmap of the Pearson matrix.
plt.figure(figsize=(8, 6))
heatmap_options = dict(
    annot=True,        # Print the correlation value in each cell
    fmt='.2f',
    cmap='RdYlGn',     # Red (negative), yellow (neutral), green (positive)
    vmin=-1, vmax=1,   # Full range of a correlation coefficient
    square=True,
)
sns.heatmap(cor_pearson, **heatmap_options)
plt.title('Matriks Korelasi (Pearson)', fontsize=14, pad=10)
plt.show()
R Code
# Memuat library yang diperlukan
library(corrplot)
library(dplyr)
# Read the dataset.
data <- read.csv("data/data-dummy.csv")

# Variables that enter the correlation matrix.
vars <- c("Unit_Price", "Delivery_Time", "Efficiency", "Feature_Interaction")

# Keep only those columns, then only the rows without any NA.
data_selected <- data[vars]
data_selected <- data_selected[complete.cases(data_selected), , drop = FALSE]

# Pearson and Spearman correlation matrices.
cor_pearson <- cor(data_selected, method = "pearson")
cor_spearman <- cor(data_selected, method = "spearman")

# Red-white-green palette with 100 steps.
corr_palette <- colorRampPalette(c("#d73027", "#f7f7f7", "#1a9850"))(100)

# Plot the upper triangle of the Pearson matrix, ordered by hierarchical
# clustering.
corrplot(
  cor_pearson,
  method = "color",
  type = "upper",
  order = "hclust",
  addCoef.col = "black", # Print the correlation values
  tl.col = "black",
  tl.srt = 45,
  col = corr_palette,
  title = "Matriks Korelasi (Pearson)",
  mar = c(0, 0, 2, 0)
)
1.4 Time Series
1.4.1 Line Chart
Python Code
import pandas as pd
import matplotlib.pyplot as plt
# Read the dataset.
data = pd.read_csv("data/data-dummy.csv")
# Parse Transaction_Date and reduce it to the first day of its month.
data['Date'] = pd.to_datetime(data['Transaction_Date'])
data['YearMonth'] = data['Date'].dt.to_period('M').dt.to_timestamp()
# Keep valid rows only. NaN > 0 is False, so this comparison already removes
# missing prices and no separate dropna() is needed.
data = data[data['Total_Price'] > 0]
# Mean Total_Price per month and region.
data_agg = data.groupby(['YearMonth', 'Region'])['Total_Price'].mean().reset_index()
# Colour per Region.
colors = {'North': '#1b9e77', # Dark green
          'South': '#d95f02', # Orange
          'East': '#7570b3', # Purple
          'West': '#e7298a'} # Pink
# Draw one line per region.
plt.figure(figsize=(10, 6))
for region in data_agg['Region'].unique():
    subset = data_agg[data_agg['Region'] == region]
    plt.plot(subset['YearMonth'],
             subset['Total_Price'],
             marker='o',
             linewidth=2,
             markersize=5,
             # Regions missing from the palette fall back to gray rather than
             # raising KeyError.
             color=colors.get(region, 'gray'),
             label=region)
# Title and axis labels.
plt.title('Tren Rata-rata Total Harga per Bulan berdasarkan Wilayah', fontsize=14, pad=10)
plt.xlabel('Bulan', fontsize=12)
plt.ylabel('Rata-rata Total Harga', fontsize=12)
# Grid, legend and layout.
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(title='Wilayah')
plt.tight_layout()
plt.show()
R Code
# Memuat library yang diperlukan
library(ggplot2)
library(dplyr)
library(lubridate)
# Read the dataset.
data <- read.csv("data/data-dummy.csv")
# Parse Transaction_Date, reduce it to the first day of its month, and keep
# only rows with a positive, non-missing Total_Price.
data <- data %>%
  mutate(Date = as.Date(Transaction_Date, format = "%Y-%m-%d"),
         YearMonth = floor_date(Date, "month")) %>%
  filter(!is.na(Total_Price) & Total_Price > 0)
# Mean Total_Price per month and region.
data_agg <- data %>%
  group_by(YearMonth, Region) %>%
  summarise(Avg_Total_Price = mean(Total_Price, na.rm = TRUE), .groups = "drop")
# Line chart with one coloured line per region.
ggplot(data_agg, aes(x = YearMonth, y = Avg_Total_Price, color = Region)) +
  # `linewidth` replaces the `size` argument for lines, which is deprecated
  # since ggplot2 3.4.0.
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) + # Mark every monthly value
  scale_color_manual(values = c("North" = "#1b9e77", # Dark green
                                "South" = "#d95f02", # Orange
                                "East" = "#7570b3", # Purple
                                "West" = "#e7298a")) + # Pink
  labs(title = "Tren Rata-rata Total Harga per Bulan berdasarkan Wilayah",
       x = "Bulan",
       y = "Rata-rata Total Harga",
       color = "Wilayah") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    legend.position = "right"
  )
1.4.2 Area Chart
Python Code
import pandas as pd
import matplotlib.pyplot as plt
# Read the dataset.
data = pd.read_csv("data/data-dummy.csv")
# Parse Transaction_Date and extract the year.
data['Date'] = pd.to_datetime(data['Transaction_Date'])
data['Year'] = data['Date'].dt.year
# Keep valid rows only. NaN > 0 is False, so this comparison already removes
# missing prices and no separate notna() test is needed.
data = data[data['Unit_Price'] > 0]
# Mean Unit_Price per year (rows) and sales channel (columns); year/channel
# combinations without data become 0.
data_agg = data.groupby(['Year', 'Sales_Channel'])['Unit_Price'].mean().unstack().fillna(0)
data_agg = data_agg.loc[2020:2024] # Restrict to the years 2020-2024
# Colour per Sales_Channel.
colors = {'Online': '#1f77b4', # Dark blue
          'Offline': '#ff7f0e'} # Orange
# Stacked area chart.
plt.figure(figsize=(10, 6))
plt.stackplot(data_agg.index,
              data_agg.T,
              labels=data_agg.columns,
              # Channels missing from the palette fall back to gray rather
              # than raising KeyError.
              colors=[colors.get(cat, 'gray') for cat in data_agg.columns],
              alpha=0.8)
# Put one tick on each year so the axis does not show fractional years.
plt.xticks(data_agg.index)
# Title and axis labels.
plt.title('Rata-rata Harga per Unit per Tahun berdasarkan Saluran Penjualan', fontsize=14, pad=10)
plt.xlabel('Tahun', fontsize=12)
plt.ylabel('Rata-rata Harga per Unit', fontsize=12)
# Grid, legend and layout.
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(title='Saluran Penjualan', loc='upper center')
plt.tight_layout()
# Show the figure.
plt.show()
R Code
# Memuat library yang diperlukan
library(ggplot2)
library(dplyr)
library(lubridate)
# Read the dataset.
data <- read.csv("data/data-dummy.csv")

# Parse Transaction_Date, extract the year, and keep only rows with a
# positive, non-missing Unit_Price.
data <- mutate(
  data,
  Date = as.Date(Transaction_Date, format = "%Y-%m-%d"),
  Year = year(Date)
)
data <- filter(data, !is.na(Unit_Price) & Unit_Price > 0)

# Mean Unit_Price per year and sales channel, restricted to 2020-2024.
data_agg <- summarise(
  group_by(data, Year, Sales_Channel),
  Avg_Unit_Price = mean(Unit_Price, na.rm = TRUE),
  .groups = "drop"
)
data_agg <- filter(data_agg, between(Year, 2020, 2024))

# Fixed colour per sales channel.
channel_colours <- c(
  "Online" = "#1f77b4", # Dark blue
  "Offline" = "#ff7f0e" # Orange
)

area_labels <- labs(
  title = "Rata-rata Harga per Unit per Tahun berdasarkan Saluran Penjualan",
  x = "Tahun",
  y = "Rata-rata Harga per Unit",
  fill = "Saluran Penjualan"
)

area_theme <- theme(
  plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
  axis.title = element_text(size = 12),
  axis.text = element_text(size = 10),
  legend.position = "top"
)

# Stacked, semi-transparent area chart.
ggplot(data_agg, aes(x = Year, y = Avg_Unit_Price, fill = Sales_Channel)) +
  geom_area(position = "stack", alpha = 0.8) +
  scale_fill_manual(values = channel_colours) +
  area_labels +
  theme_minimal() +
  area_theme