Visualisasi Deskriptif

Pemrograman Sains Data I

Numerical Data

Boxplot

R Code (Boxplot)

# ------------------------------------------------------------
# 1. Packages
# ------------------------------------------------------------
library(ggplot2)  # plotting
library(dplyr)    # data verbs and the %>% pipe

# ------------------------------------------------------------
# 2. Data import and cleaning
# ------------------------------------------------------------
# Read the raw file, coerce Quantity to numeric and drop the rows for
# which Quantity is NA after coercion.
data_bisnis <- read.csv("data_bisnis.csv", stringsAsFactors = FALSE) %>%
  mutate(Quantity = as.numeric(Quantity)) %>%
  filter(!is.na(Quantity))

# Quartiles and the 1.5 * IQR fences; values outside the fences are
# counted and coloured as outliers further down.
IQR_value <- IQR(data_bisnis$Quantity)
Q1 <- quantile(data_bisnis$Quantity, 0.25)
Q3 <- quantile(data_bisnis$Quantity, 0.75)
lower_whisker <- Q1 - 1.5 * IQR_value
upper_whisker <- Q3 + 1.5 * IQR_value

# ------------------------------------------------------------
# 3. One-row table of summary statistics
# ------------------------------------------------------------
stats <- summarise(
  data_bisnis,
  Mean = mean(Quantity),
  Q1 = Q1,
  Median = median(Quantity),
  Q3 = Q3,
  Min = min(Quantity),
  Max = max(Quantity),
  Outliers = sum(Quantity < lower_whisker | Quantity > upper_whisker)
)

# ------------------------------------------------------------
# 4. Boxplot with jittered points and text annotations
# ------------------------------------------------------------
# Rows holding the maximum, kept only while the maximum lies inside the
# upper fence; otherwise this is empty and the highlight draws nothing.
max_inside_fence <- data_bisnis %>%
  filter(Quantity == stats$Max[[1]], Quantity <= upper_whisker)

# One text label per statistic, placed to the right of the box (x = 1.2)
# at the height of the statistic itself.
side_label_specs <- list(
  list(stat = "Mean",   colour = "blue",      face = "bold"),
  list(stat = "Q1",     colour = "darkgreen", face = "plain"),
  list(stat = "Median", colour = "purple",    face = "plain"),
  list(stat = "Q3",     colour = "darkgreen", face = "plain"),
  list(stat = "Min",    colour = "orange",    face = "plain"),
  list(stat = "Max",    colour = "orange",    face = "plain")
)
side_labels <- lapply(side_label_specs, function(spec) {
  value <- stats[[spec$stat]][[1]]
  ggplot2::annotate(
    "text",
    x = 1.2, y = value,
    label = paste(paste0(spec$stat, ":"), round(value, 2)),
    hjust = 0, fontface = spec$face, color = spec$colour
  )
})

ggplot(data_bisnis, aes(x = factor(1), y = Quantity)) +
  # Box without its own outlier glyphs; the jitter layer shows every point
  geom_boxplot(fill = "skyblue", outlier.shape = NA) +
  # All observations, red when outside the fences and black otherwise
  geom_jitter(
    aes(color = Quantity < lower_whisker | Quantity > upper_whisker),
    width = 0.1, size = 1, alpha = 0.5
  ) +
  scale_color_manual(
    values = c("FALSE" = "black", "TRUE" = "red"),
    guide = "none"
  ) +
  # Large red marker on the maximum when it is not an outlier
  geom_point(
    data = max_inside_fence,
    aes(x = factor(1), y = Quantity),
    color = "red", size = 8
  ) +
  side_labels +
  # Outlier count, centred 5% above the maximum
  ggplot2::annotate(
    "text",
    x = 1, y = stats$Max[[1]] + 0.05 * stats$Max[[1]],
    label = paste("Outliers:", stats$Outliers[[1]]),
    color = "red", fontface = "italic", hjust = 0.5
  ) +
  labs(
    title = "Boxplot of Quantity with Jitter and Annotations",
    x = NULL,
    y = "Quantity"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.title = element_text(size = 20, face = "bold"),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )

Python Code (Boxplot)

# ------------------------------------------------------------
# 1. Imports
# ------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# ------------------------------------------------------------
# 2. Data import and cleaning
# ------------------------------------------------------------
data_bisnis = pd.read_csv("data_bisnis.csv")

# Non-numeric Quantity values become NaN and those rows are dropped.
data_bisnis['Quantity'] = pd.to_numeric(data_bisnis['Quantity'], errors='coerce')
data_bisnis = data_bisnis.dropna(subset=['Quantity'])
quantity = data_bisnis['Quantity']

# Quartiles and the 1.5 * IQR fences used to flag outliers.
Q1 = quantity.quantile(0.25)
Q3 = quantity.quantile(0.75)
IQR_value = Q3 - Q1
lower_whisker = Q1 - 1.5 * IQR_value
upper_whisker = Q3 + 1.5 * IQR_value

# True for every observation outside the fences.
is_outlier = (quantity < lower_whisker) | (quantity > upper_whisker)

# ------------------------------------------------------------
# 3. Summary statistics
# ------------------------------------------------------------
stats = dict(
    Mean=quantity.mean(),
    Q1=Q1,
    Median=quantity.median(),
    Q3=Q3,
    Min=quantity.min(),
    Max=quantity.max(),
    Outliers=is_outlier.sum(),
)

# ------------------------------------------------------------
# 4. Boxplot with jittered points and annotations
# ------------------------------------------------------------
fig, ax = plt.subplots(figsize=(10, 8))

# A single empty-string category puts everything in one box at x = 0.
blank_x = [""] * len(data_bisnis)
sns.boxplot(x=blank_x, y=quantity, color='skyblue', showfliers=False, ax=ax)

# Every observation as a jittered dot: red outside the fences, black inside.
sns.stripplot(
    x=blank_x,
    y=quantity,
    hue=is_outlier,
    palette={False: "black", True: "red"},
    dodge=False,
    jitter=0.1,
    size=5,
    alpha=0.5,
    ax=ax,
)
# Replace the automatic hue legend with an empty, frameless one.
ax.legend([], [], frameon=False)

# Large red marker on the maximum when it is not an outlier.
if stats['Max'] <= upper_whisker:
    ax.scatter(0, stats['Max'], color='red', s=200, zorder=3)

# Side labels, each at the height of its own statistic.
side_labels = [
    ("Mean", "blue", {"weight": "bold"}),
    ("Q1", "darkgreen", {}),
    ("Median", "purple", {}),
    ("Q3", "darkgreen", {}),
    ("Min", "orange", {}),
    ("Max", "orange", {}),
]
for key, colour, extra in side_labels:
    ax.text(0.1, stats[key], f"{key}: {stats[key]:.2f}", color=colour, **extra)

# Outlier count, centred 5% above the maximum.
ax.text(0, stats['Max'] * 1.05, f"Outliers: {stats['Outliers']}",
        color="red", style='italic', ha='center')

# Titles, axis cosmetics and rendering.
ax.set_title("Boxplot of Quantity with Jitter and Annotations", fontsize=20, weight='bold')
ax.set_ylabel("Quantity", fontsize=16)
ax.set_xticks([])
ax.tick_params(axis='y', labelsize=14)
fig.tight_layout()
plt.show()

Violin-plot

R Code (Violin-plot)

# ------------------------------------------------------------
# 1. Packages
# ------------------------------------------------------------
library(ggplot2)
library(dplyr)

# ------------------------------------------------------------
# 2. Data import and cleaning
# ------------------------------------------------------------
# Read the file, coerce Quantity to numeric, drop rows where it is NA.
data_bisnis <- read.csv("data_bisnis.csv", stringsAsFactors = FALSE) %>%
  mutate(Quantity = as.numeric(Quantity)) %>%
  filter(!is.na(Quantity))

# Quartiles and the 1.5 * IQR fences for outlier detection.
IQR_value <- IQR(data_bisnis$Quantity)
Q1 <- quantile(data_bisnis$Quantity, 0.25)
Q3 <- quantile(data_bisnis$Quantity, 0.75)
lower_whisker <- Q1 - 1.5 * IQR_value
upper_whisker <- Q3 + 1.5 * IQR_value

# Label each row "Outlier" (outside the fences) or "Normal".
data_bisnis <- mutate(
  data_bisnis,
  is_outlier = if_else(
    Quantity < lower_whisker | Quantity > upper_whisker,
    "Outlier",
    "Normal"
  )
)

# ------------------------------------------------------------
# 3. One-row table of summary statistics
# ------------------------------------------------------------
stats <- summarise(
  data_bisnis,
  Mean = mean(Quantity),
  Q1 = Q1,
  Median = median(Quantity),
  Q3 = Q3,
  Min = min(Quantity),
  Max = max(Quantity),
  Outliers = sum(is_outlier == "Outlier")
)

# ------------------------------------------------------------
# 4. Violin plot with coloured jitter and annotations
# ------------------------------------------------------------
# Text layer for one column of `stats`: label "<name>: <value>" drawn
# left-aligned at x = 1.2 and at the height of the statistic. Extra
# arguments (color, fontface) are forwarded to geom_text().
stat_label <- function(stat_name, ...) {
  geom_text(
    data = stats,
    aes(
      x = 1.2,
      y = .data[[stat_name]],
      label = paste(paste0(stat_name, ":"), round(.data[[stat_name]], 2))
    ),
    hjust = 0,
    ...
  )
}

# Rows holding the maximum, kept only when the maximum is not an outlier.
max_inside_fence <- data_bisnis %>%
  filter(Quantity == stats$Max[[1]], Quantity <= upper_whisker)

ggplot(data_bisnis, aes(x = factor(1), y = Quantity)) +
  geom_violin(fill = "skyblue", trim = FALSE) +
  # Narrow box inside the violin; outliers are shown by the jitter layer
  geom_boxplot(width = 0.1, outlier.shape = NA, color = "black") +
  geom_jitter(aes(color = is_outlier), width = 0.1, alpha = 0.6, size = 2) +
  geom_point(
    data = max_inside_fence,
    aes(x = factor(1), y = Quantity),
    color = "red", size = 8
  ) +

  # Side labels for the six statistics
  stat_label("Mean", color = "blue", fontface = "bold") +
  stat_label("Q1", color = "darkgreen") +
  stat_label("Median", color = "purple") +
  stat_label("Q3", color = "darkgreen") +
  stat_label("Min", color = "orange") +
  stat_label("Max", color = "orange") +

  # Outlier count, centred 5% above the maximum
  geom_text(
    data = stats,
    aes(x = 1, y = Max + 0.05 * Max, label = paste("Outliers:", Outliers)),
    color = "red", fontface = "italic", hjust = 0.5
  ) +

  scale_color_manual(values = c("Normal" = "black", "Outlier" = "red")) +

  labs(
    title = "Violin Plot of Quantity with Outlier Highlighted",
    x = NULL,
    y = "Quantity",
    color = "Point Type"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.title = element_text(size = 20, face = "bold"),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 15),
    legend.position = "right",
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 9)
  )

Python Code (Violin-plot)

# ------------------------------------------------------------
# 1. Imports
# ------------------------------------------------------------
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ------------------------------------------------------------
# 2. Data import and cleaning
# ------------------------------------------------------------
data_bisnis = pd.read_csv("data_bisnis.csv")

# Non-numeric Quantity values become NaN and those rows are dropped.
data_bisnis['Quantity'] = pd.to_numeric(data_bisnis['Quantity'], errors='coerce')
data_bisnis = data_bisnis.dropna(subset=['Quantity'])
quantity = data_bisnis['Quantity']

# Quartiles and the 1.5 * IQR fences.
Q1 = quantity.quantile(0.25)
Q3 = quantity.quantile(0.75)
IQR_value = Q3 - Q1
lower_whisker = Q1 - 1.5 * IQR_value
upper_whisker = Q3 + 1.5 * IQR_value

# Text label per row: 'Outlier' outside the fences, 'Normal' otherwise.
outlier_mask = (quantity < lower_whisker) | (quantity > upper_whisker)
data_bisnis['is_outlier'] = outlier_mask.map({True: 'Outlier', False: 'Normal'})

# ------------------------------------------------------------
# 3. Summary statistics
# ------------------------------------------------------------
stats = dict(
    Mean=quantity.mean(),
    Q1=Q1,
    Median=quantity.median(),
    Q3=Q3,
    Min=quantity.min(),
    Max=quantity.max(),
    Outliers=outlier_mask.sum(),
)

# ------------------------------------------------------------
# 4. Violin plot with coloured jitter
# ------------------------------------------------------------
fig, ax = plt.subplots(figsize=(10, 8))

# A single empty-string category keeps every layer at x = 0.
blank_x = [""] * len(data_bisnis)

# Density outline only; the inner summary comes from the boxplot below.
sns.violinplot(x=blank_x, y=data_bisnis['Quantity'], inner=None, color="skyblue", ax=ax)

# Narrow white box drawn over the violin, flier markers disabled.
sns.boxplot(
    x=blank_x,
    y=data_bisnis['Quantity'],
    width=0.1,
    showcaps=True,
    boxprops=dict(facecolor='white', edgecolor='black'),
    whiskerprops=dict(color='black'),
    medianprops=dict(color='black'),
    flierprops=dict(marker=None),
    ax=ax,
)

# Every observation as a jittered dot, coloured by its outlier label.
sns.stripplot(
    x=blank_x,
    y=data_bisnis['Quantity'],
    hue=data_bisnis['is_outlier'],
    palette={"Normal": "black", "Outlier": "red"},
    dodge=False,
    jitter=0.1,
    alpha=0.6,
    size=4,
    ax=ax,
)

# Large red marker on the maximum when it is not an outlier.
if stats['Max'] <= upper_whisker:
    ax.scatter(0, stats['Max'], color='red', s=200, zorder=3)

# ------------------------------------------------------------
# 5. Annotations
# ------------------------------------------------------------
side_labels = [
    ("Mean", "blue", {"weight": "bold"}),
    ("Q1", "darkgreen", {}),
    ("Median", "purple", {}),
    ("Q3", "darkgreen", {}),
    ("Min", "orange", {}),
    ("Max", "orange", {}),
]
for key, colour, extra in side_labels:
    ax.text(0.1, stats[key], f"{key}: {stats[key]:.2f}", color=colour, **extra)

# Outlier count, centred 5% above the maximum.
ax.text(0, stats['Max'] * 1.05, f"Outliers: {stats['Outliers']}",
        color="red", style='italic', ha='center')

# ------------------------------------------------------------
# 6. Formatting
# ------------------------------------------------------------
ax.set_title("Violin Plot of Quantity with Outlier Highlighted", fontsize=20, weight='bold')
ax.set_ylabel("Quantity", fontsize=16)
ax.set_xticks([])
ax.tick_params(axis='y', labelsize=14)
ax.legend(title="Point Type", title_fontsize=10, fontsize=9, loc="upper right")
fig.tight_layout()
plt.show()

Combo

Grouped Bar Chart

R Code (Grouped Bar-chart)

# ------------------------------------------------------------
# 1. Packages
# ------------------------------------------------------------
library(ggplot2)
library(dplyr)

# ------------------------------------------------------------
# 2. Data import
# ------------------------------------------------------------
data_bisnis <- read.csv("data_bisnis.csv", stringsAsFactors = FALSE)

# ------------------------------------------------------------
# 3. Total sales per category and region
# ------------------------------------------------------------
# Sum Total_Price (ignoring NA) within each Product_Category x Region
# pair and return an ungrouped table.
sales_summary <- summarise(
  group_by(data_bisnis, Product_Category, Region),
  Total_Sales = sum(Total_Price, na.rm = TRUE),
  .groups = "drop"
)

# ------------------------------------------------------------
# 4. Grouped (dodged) bar chart
# ------------------------------------------------------------
ggplot(
  sales_summary,
  aes(x = Product_Category, y = Total_Sales, fill = Region)
) +
  # Bar heights are the pre-computed totals; regions sit side by side
  geom_col(position = position_dodge()) +
  labs(
    title = "Total Sales by Product Category and Region",
    x = "Product Category",
    y = "Total Sales (USD)",
    fill = "Region"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    axis.text.x = element_text(angle = 25, hjust = 1),
    plot.title = element_text(face = "bold", hjust = 0.5)
  )

Python Code (Grouped Bar-chart)

# ------------------------------------------------------------
# 1. Imports
# ------------------------------------------------------------
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ------------------------------------------------------------
# 2. Data import
# ------------------------------------------------------------
data_bisnis = pd.read_csv("data_bisnis.csv")

# ------------------------------------------------------------
# 3. Total sales per category and region
# ------------------------------------------------------------
# Sum Total_Price within each (Product_Category, Region) pair and return
# a flat table with the total in a Total_Sales column.
sales_summary = (
    data_bisnis
    .groupby(['Product_Category', 'Region'])['Total_Price']
    .sum()
    .reset_index(name='Total_Sales')
)

# ------------------------------------------------------------
# 4. Grouped bar chart
# ------------------------------------------------------------
# The theme is set after the figure exists but before seaborn creates
# the axes, so this order is kept deliberately.
plt.figure(figsize=(10, 6))
sns.set_theme(style="whitegrid")

ax = sns.barplot(
    data=sales_summary,
    x="Product_Category",
    y="Total_Sales",
    hue="Region",
    dodge=True,
    palette="Set2",
)

# Title and axis labels
ax.set_title("Total Sales by Product Category and Region", fontsize=14, fontweight='bold')
ax.set_xlabel("Product Category", fontsize=12)
ax.set_ylabel("Total Sales (USD)", fontsize=12)

# Rotate the category labels on the x axis
plt.xticks(rotation=25, ha='right')

# Legend with an explicit title and smaller fonts
plt.legend(title="Region", title_fontsize=10, fontsize=9)

plt.tight_layout()
plt.show()

Ridgeline Plot

R Code (Ridgeline Plot)

# ------------------------------------------------------------
# 1. Packages
# ------------------------------------------------------------
library(ggridges)  # geom_density_ridges()
library(ggplot2)
library(dplyr)
library(scales)    # dollar_format()

# ------------------------------------------------------------
# 2. Data import and filtering
# ------------------------------------------------------------
data_bisnis <- read.csv("data_bisnis.csv", stringsAsFactors = FALSE)

# Keep only rows whose Price_per_Unit is a finite number
# (is.finite() is FALSE for NA, NaN, Inf and -Inf).
data_bisnis_filtered <- filter(data_bisnis, is.finite(Price_per_Unit))

# ------------------------------------------------------------
# 3. Ridgeline plot
# ------------------------------------------------------------
# Axis label formatter: "Rp" prefix, "." for thousands, "," for decimals.
rupiah_labels <- dollar_format(
  prefix = "Rp",
  big.mark = ".",
  decimal.mark = ","
)

ggplot(
  data_bisnis_filtered,
  aes(x = Price_per_Unit, y = Region, fill = Region)
) +
  # One density ridge per region; scale > 1 lets neighbours overlap
  geom_density_ridges(alpha = 0.7, scale = 1.2) +
  scale_x_continuous(labels = rupiah_labels) +
  labs(
    title = "Distribution of Price per Unit by Region",
    x = "Price per Unit",
    y = "Region"
  ) +
  theme_minimal() +
  # The y axis already names the regions, so the fill legend is dropped
  theme(legend.position = "none")

Python Code (Ridgeline Plot)

# ==============================
# 1. Load Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from joypy import joyplot
import numpy as np
import matplotlib.ticker as ticker

# ==============================
# 2. Load and Filter Valid Data
# ==============================
data_bisnis = pd.read_csv("data_bisnis.csv")

# Coerce Price_per_Unit to numeric first (unparseable values become NaN)
# so that np.isfinite never receives strings, then keep only the rows
# whose price is finite (drops NaN and +/-inf).
price_per_unit = pd.to_numeric(data_bisnis["Price_per_Unit"], errors="coerce")
data_bisnis_filtered = (
    data_bisnis
    .assign(Price_per_Unit=price_per_unit)
    [np.isfinite(price_per_unit)]
)

# ==============================
# 3. Create Ridgeline Plot
# ==============================
# joyplot() builds its own figure, so the size is passed to it directly.
# Calling plt.figure(figsize=...) beforehand would only leave an extra,
# empty figure open and would not size the ridgeline plot.
fig, axes = joyplot(
    data_bisnis_filtered,
    by="Region",
    column="Price_per_Unit",
    figsize=(10, 6),
    colormap=plt.cm.Set3,
    alpha=0.7,
    overlap=1.2,
    linewidth=1,
    fade=True
)

# Format the x axis as Indonesian-style currency: "Rp" prefix and "."
# as the thousands separator, no decimals. The formatter is attached to
# the last axes returned by joyplot.
axes[-1].xaxis.set_major_formatter(ticker.FuncFormatter(
    lambda x, pos: f'Rp{x:,.0f}'.replace(',', '.')
))

# Title and labels (applied to the current axes)
plt.title("Distribution of Price per Unit by Region", fontsize=14, weight='bold')
plt.xlabel("Price per Unit", fontsize=10)
plt.ylabel("Region", fontsize=10)

plt.tight_layout()
plt.show()

Boxplot by Category

R Code (Boxplot by Category)

# ------------------------------------------------------------
# 1. Packages
# ------------------------------------------------------------
library(ggplot2)
library(dplyr)

# ------------------------------------------------------------
# 2. Data import and cleaning
# ------------------------------------------------------------
# Read the file, coerce Quantity to numeric and drop rows where the
# result is NA.
data_bisnis <- read.csv("data_bisnis.csv", stringsAsFactors = FALSE) %>%
  mutate(Quantity = as.numeric(Quantity)) %>%
  filter(!is.na(Quantity))

# ------------------------------------------------------------
# 3. One box per product category
# ------------------------------------------------------------
# Outliers are drawn as solid red dots (shape 16) of size 2.
category_boxes <- geom_boxplot(
  outlier.colour = "red",
  outlier.shape = 16,
  outlier.size = 2
)

ggplot(
  data_bisnis,
  aes(x = Product_Category, y = Quantity, fill = Product_Category)
) +
  category_boxes +
  labs(
    title = "Boxplot of Quantity by Product Category",
    x = "Product Category",
    y = "Quantity"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    # The x axis already names the categories
    legend.position = "none"
  )

Python Code (Boxplot by Category)

# ------------------------------------------------------------
# 1. Imports
# ------------------------------------------------------------
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ------------------------------------------------------------
# 2. Data import and cleaning
# ------------------------------------------------------------
data_bisnis = pd.read_csv("data_bisnis.csv")

# Non-numeric Quantity values become NaN and those rows are dropped.
data_bisnis['Quantity'] = pd.to_numeric(data_bisnis['Quantity'], errors='coerce')
data_bisnis = data_bisnis.dropna(subset=['Quantity'])

# Turn Product_Category into an ordered categorical whose categories are
# sorted alphabetically, which fixes the left-to-right order of the boxes.
sorted_categories = sorted(data_bisnis['Product_Category'].dropna().unique())
category_dtype = pd.CategoricalDtype(categories=sorted_categories, ordered=True)
data_bisnis['Product_Category'] = data_bisnis['Product_Category'].astype(category_dtype)

# ------------------------------------------------------------
# 3. One box per product category
# ------------------------------------------------------------
fig, ax = plt.subplots(figsize=(10, 6))

# Fliers (outliers) are shown as red filled circles.
red_fliers = {
    'marker': 'o',
    'markerfacecolor': 'red',
    'markersize': 5,
    'linestyle': 'none',
}
# NOTE(review): newer seaborn releases may warn about `palette` being
# given without `hue`; the call is kept as-is to preserve the output.
sns.boxplot(
    data=data_bisnis,
    x='Product_Category',
    y='Quantity',
    palette='pastel',
    showfliers=True,
    flierprops=red_fliers,
    ax=ax,
)

ax.set_title("Boxplot of Quantity by Product Category", fontsize=14, fontweight='bold')
ax.set_xlabel("Product Category", fontsize=12)
ax.set_ylabel("Quantity", fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
# Remove the top and right spines for a lighter frame.
sns.despine()
plt.tight_layout()
plt.show()

Lollipop Chart

R Code (Lollipop Chart)

# ==============================
# 1. Load Required Libraries
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Prepare Data
# ==============================
data_bisnis <- read.csv("data_bisnis.csv", stringsAsFactors = FALSE)

# Summarize total sales by Product_Category and Region.
#
# Product_Category is then turned into a factor ordered by sales
# (reorder() ranks each category by its mean Total_Sales across regions).
# The ordering is done once, in the data, so that every layer maps the
# SAME variable to y. Mapping reorder(...) in ggplot() while
# geom_segment() maps the raw column lets the segment layer (drawn first)
# fix the discrete y scale to the raw values' order, and the requested
# ordering is lost.
sales_grouped <- data_bisnis %>%
  group_by(Product_Category, Region) %>%
  summarise(Total_Sales = sum(Total_Price, na.rm = TRUE), .groups = "drop") %>%
  mutate(Product_Category = reorder(Product_Category, Total_Sales))

# ==============================
# 3. Grouped Lollipop Chart
# ==============================
# One stem (segment from 0 to the total) and one head (point) per
# category/region pair, coloured by region. y is inherited by the
# segment layer; only its end points are given explicitly.
ggplot(sales_grouped, aes(x = Total_Sales, y = Product_Category, color = Region)) +
  geom_segment(aes(x = 0, xend = Total_Sales, yend = Product_Category), size = 1) +
  geom_point(size = 4) +
  labs(
    title = "Grouped Lollipop Chart",
    x = "Total Sales",
    y = "Product Category"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold")
  )

# ==============================
# 4. Faceted Lollipop Chart
# ==============================
# Same data, one panel per region; each panel gets its own x range
# (scales = "free_x") while the category order on y is shared.
ggplot(sales_grouped, aes(x = Total_Sales, y = Product_Category)) +
  geom_segment(aes(x = 0, xend = Total_Sales, yend = Product_Category), color = "skyblue", size = 1) +
  geom_point(color = "blue", size = 4) +
  facet_wrap(~ Region, scales = "free_x") +
  labs(
    title = "Faceted Lollipop Chart",
    x = "Total Sales",
    y = "Product Category"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold")
  )

Python Code (Lollipop Chart)

# ==============================
# 1. Load Required Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Optional for theme style
sns.set_theme(style="whitegrid")

# ==============================
# 2. Prepare Data
# ==============================
data_bisnis = pd.read_csv("data_bisnis.csv")

# Summarize total sales by Product_Category and Region
sales_grouped = (
    data_bisnis
    .groupby(["Product_Category", "Region"], as_index=False)
    .agg(Total_Sales=("Total_Price", "sum"))
)

# ==============================
# 3. Grouped Lollipop Chart
# ==============================
plt.figure(figsize=(10, 6))

# One fixed category order shared by every region: categories sorted by
# their mean Total_Sales across regions (ascending, bottom to top).
# Plotting against these numeric positions keeps the y axis independent
# of the order in which the regions happen to be drawn.
category_order = (
    sales_grouped
    .groupby("Product_Category")["Total_Sales"]
    .mean()
    .sort_values()
    .index
    .tolist()
)
y_position = {category: i for i, category in enumerate(category_order)}

for region in sales_grouped["Region"].unique():
    region_data = sales_grouped[sales_grouped["Region"] == region]
    y = region_data["Product_Category"].map(y_position)

    # Heads: drawn first so the colour picked from the property cycle can
    # be reused for the stems. Only the heads carry a label, so each
    # region appears exactly once in the legend.
    (heads,) = plt.plot(region_data["Total_Sales"], y, 'o', markersize=8, label=f"{region}")

    # Stems: same colour as the heads (hlines would otherwise use one
    # default colour for every region).
    plt.hlines(y=y, xmin=0, xmax=region_data["Total_Sales"],
               colors=[heads.get_color()], linewidth=2)

plt.title("Grouped Lollipop Chart", fontsize=16, fontweight='bold')
plt.xlabel("Total Sales", fontsize=14)
plt.ylabel("Product Category", fontsize=14)
plt.xticks(fontsize=12)
# Replace the numeric positions with the category names.
plt.yticks(range(len(category_order)), category_order, fontsize=12)
plt.legend(title="Region")
plt.tight_layout()
plt.show()

# ==============================
# 4. Faceted Lollipop Chart (2x2 Layout)
# ==============================
# Create FacetGrid with 2 columns per row; every facet gets its own x range
g = sns.FacetGrid(sales_grouped, col="Region", col_wrap=2, sharex=False, height=5, aspect=1.2)

# Define plotting function
def lollipop(data, **kwargs):
    """Draw one facet's lollipops on the current axes.

    `data` is the subset of `sales_grouped` for one region, as passed by
    `FacetGrid.map_dataframe`. Extra keyword arguments supplied by the
    grid are accepted and ignored.
    """
    data = data.sort_values("Total_Sales")
    plt.hlines(y=data["Product_Category"], xmin=0, xmax=data["Total_Sales"], color="skyblue", linewidth=2)
    plt.plot(data["Total_Sales"], data["Product_Category"], 'o', color="blue", markersize=8)

# Map to each facet
g.map_dataframe(lollipop)

g.set_axis_labels("Total Sales", "Product Category")

g.set_titles(col_template="{col_name}")

# Leave room above the facets for the figure-level title
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("Faceted Lollipop Chart", fontsize=16, fontweight='bold')
plt.show()

Heatmap

R Code (Heatmap)

# ------------------------------------------------------------
# 1. Packages
# ------------------------------------------------------------
library(ggplot2)
library(dplyr)
library(readr)
library(forcats)  # provides fct_rev()

# ------------------------------------------------------------
# 2. Data import and aggregation
# ------------------------------------------------------------
data_bisnis <- read_csv("data_bisnis.csv")

# Total sales for every Region x Product_Category pair. The category is
# then stored as a factor with its levels reversed, so the alphabetical
# order runs top-to-bottom on the y axis.
heatmap_data <- data_bisnis %>%
  group_by(Region, Product_Category) %>%
  summarise(Total_Sales = sum(Total_Price, na.rm = TRUE), .groups = "drop") %>%
  mutate(Product_Category = factor(Product_Category) %>% fct_rev())

# ------------------------------------------------------------
# 3. Heatmap with value labels
# ------------------------------------------------------------
ggplot(
  heatmap_data,
  aes(x = Region, y = Product_Category, fill = Total_Sales)
) +
  # White borders separate the tiles
  geom_tile(color = "white") +
  # Rounded total printed inside each tile
  geom_text(aes(label = round(Total_Sales)), color = "black", size = 4) +
  scale_fill_gradient(
    low = "#ffe5e5",
    high = "#990000",
    name = "Total Sales"
  ) +
  labs(
    title = "Heatmap of Total Sales by Region and Product Category",
    x = "Region",
    y = "Product Category"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 25, hjust = 1),
    plot.title = element_text(size = 16, face = "bold"),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  )

Python Code (Heatmap)

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ------------------------------------------------------------
# 1. Data import and aggregation
# ------------------------------------------------------------
df = pd.read_csv('data_bisnis.csv')

# Long table: total Total_Price per (Product_Category, Region) pair.
heatmap_data = (
    df
    .groupby(['Product_Category', 'Region'])['Total_Price']
    .sum()
    .reset_index()
)

# Wide matrix: categories as rows, regions as columns, totals as cells.
heatmap_pivot = (
    heatmap_data
    .set_index(['Product_Category', 'Region'])['Total_Price']
    .unstack('Region')
)

# ------------------------------------------------------------
# 2. Heatmap
# ------------------------------------------------------------
fig, ax = plt.subplots(figsize=(10, 6))

# Each cell is annotated with its total, printed without decimals.
sns.heatmap(heatmap_pivot, cmap='Reds', linewidths=0.5, annot=True, fmt=".0f", ax=ax)

ax.set_title('Heatmap of Total Sales by Region and Product Category', fontsize=16, fontweight='bold')
ax.set_xlabel('Region', fontsize=14)
ax.set_ylabel('Product Category', fontsize=14)
plt.xticks(rotation=25)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

Relationship

Scatter Plot

R Code (Scatter Plot)

# ------------------------------------------------------------
# 1. Packages
# ------------------------------------------------------------
library(ggplot2)
library(dplyr)
library(readr)

# ------------------------------------------------------------
# 2. Data import and cleaning
# ------------------------------------------------------------
# Coerce Quantity and Total_Price to numeric, then keep only the rows
# where both are present.
data_bisnis <- read_csv("data_bisnis.csv") %>%
  mutate(across(c(Quantity, Total_Price), as.numeric)) %>%
  filter(!(is.na(Quantity) | is.na(Total_Price)))

# ------------------------------------------------------------
# 3. Scatter plot
# ------------------------------------------------------------
ggplot(
  data_bisnis,
  aes(x = Quantity, y = Total_Price, color = Product_Category)
) +
  # Semi-transparent points so overlapping observations stay visible
  geom_point(alpha = 0.7, size = 3) +
  labs(
    title = "Scatter Plot of Total Price vs Quantity",
    x = "Quantity",
    y = "Total Price",
    color = "Product Category"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  )

Python Code (Scatter Plot)

# ------------------------------------------------------------
# 1. Imports
# ------------------------------------------------------------
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ------------------------------------------------------------
# 2. Data import and cleaning
# ------------------------------------------------------------
df = pd.read_csv("data_bisnis.csv")

# Coerce both plotted columns to numeric (bad values become NaN), then
# drop every row where either of them is missing.
numeric_columns = ['Quantity', 'Total_Price']
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')
df = df.dropna(subset=numeric_columns)

# ------------------------------------------------------------
# 3. Scatter plot
# ------------------------------------------------------------
fig, ax = plt.subplots(figsize=(10, 6))

# One point per row, coloured by product category.
sns.scatterplot(
    data=df,
    x='Quantity',
    y='Total_Price',
    hue='Product_Category',
    palette='Set2',
    alpha=0.7,
    s=80,
    ax=ax,
)

ax.set_title("Scatter Plot of Total Price vs Quantity", fontsize=16, weight='bold')
ax.set_xlabel("Quantity", fontsize=14)
ax.set_ylabel("Total Price", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
ax.legend(title='Product Category', fontsize=10, title_fontsize=12)
ax.grid(True)
plt.tight_layout()
plt.show()

Bubble Chart

R Code (Bubble Chart)

# ------------------------------------------------------------
# 1. Packages
# ------------------------------------------------------------
library(ggplot2)
library(dplyr)
library(readr)

# ------------------------------------------------------------
# 2. Data import and cleaning
# ------------------------------------------------------------
# Coerce the three measures to numeric, then keep only rows where all
# of them and the product category are present.
data_bisnis <- read_csv("data_bisnis.csv") %>%
  mutate(across(c(Quantity, Unit_Price, Total_Price), as.numeric)) %>%
  filter(
    !is.na(Quantity),
    !is.na(Unit_Price),
    !is.na(Total_Price),
    !is.na(Product_Category)
  )

# ------------------------------------------------------------
# 3. Bubble chart
# ------------------------------------------------------------
ggplot(
  data_bisnis,
  aes(
    x = Quantity,
    y = Unit_Price,
    size = Total_Price,
    color = Product_Category
  )
) +
  geom_point(alpha = 0.6) +
  # Bubble sizes are mapped onto the range 2 to 20
  scale_size_continuous(range = c(2, 20)) +
  labs(
    title = "Bubble Chart of Quantity vs Unit Price",
    subtitle = "Bubble Size = Total Price",
    x = "Quantity",
    y = "Unit Price",
    size = "Total Price",
    color = "Product Category"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12)
  )

Python Code (Bubble Chart)

# ------------------------------------------------------------
# 1. Imports
# ------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ------------------------------------------------------------
# 2. Data import and cleaning
# ------------------------------------------------------------
data = pd.read_csv("data_bisnis.csv")

# Coerce the three measures to numeric; unparseable values become NaN.
for column in ('Quantity', 'Unit_Price', 'Total_Price'):
    data[column] = pd.to_numeric(data[column], errors='coerce')

# Drop rows missing any measure or the product category.
required_columns = ['Quantity', 'Unit_Price', 'Total_Price', 'Product_Category']
data = data.dropna(subset=required_columns)

# ------------------------------------------------------------
# 3. Bubble chart
# ------------------------------------------------------------
fig, ax = plt.subplots(figsize=(12, 8))

# Marker area encodes Total_Price (mapped onto 50..1200), colour encodes
# the product category; a thin white edge separates overlapping bubbles.
bubble = sns.scatterplot(
    data=data,
    x='Quantity',
    y='Unit_Price',
    size='Total_Price',
    hue='Product_Category',
    sizes=(50, 1200),
    alpha=0.6,
    palette='tab10',
    edgecolor='w',
    linewidth=0.5,
    ax=ax,
)

# ------------------------------------------------------------
# 4. Titles, legend and layout
# ------------------------------------------------------------
ax.set_title("Bubble Chart of Quantity vs Unit Price", fontsize=16, fontweight='bold')
fig.suptitle("Bubble Size = Total Price", fontsize=12)
ax.set_xlabel("Quantity", fontsize=14)
ax.set_ylabel("Unit Price", fontsize=14)
# Legend anchored just outside the right edge of the axes.
ax.legend(title="Product Category", title_fontsize=12, loc='best', bbox_to_anchor=(1.05, 1))
plt.tight_layout()
ax.grid(True)
plt.show()

Correlation Matrix

R Code (Correlation Matrix)

# ==============================
# Correlation heatmap (R): pairwise correlations of Quantity, Unit_Price,
# Discount and Feature_Interaction drawn as a labelled tile grid.
# ==============================

# ---- 1. Packages ----
library(readr)
library(dplyr)
library(ggplot2)
library(reshape2)
library(RColorBrewer)

# ---- 2. Read the data and keep the variables of interest ----
data <- read_csv("data_bisnis.csv")

# Keep the four variables and drop every row that has an NA in any of them
selected_vars <- na.omit(
  select(data, Quantity, Unit_Price, Discount, Feature_Interaction)
)

# ---- 3. Correlation matrix, rounded to two decimals ----
cor_mat <- selected_vars %>%
  cor(use = "complete.obs") %>%
  round(2)

# Long format (Var1, Var2, value): one row per cell of the matrix
melted_cor <- melt(cor_mat)

# ---- 4. Heatmap ----
# Reverse the variable order on the y axis so the matrix reads top-to-bottom
# in the same order as it reads left-to-right.
y_levels <- rev(colnames(cor_mat))
heat_colors <- brewer.pal(n = 9, name = "YlGnBu")

ggplot(
  melted_cor,
  aes(x = Var1, y = factor(Var2, levels = y_levels), fill = value)
) +
  geom_tile(color = "white") +
  # Print the correlation value inside each tile
  geom_text(aes(label = value), color = "black", size = 4) +
  scale_fill_gradientn(
    colors = heat_colors,
    limits = c(-1, 1),
    name = "Correlation"
  ) +
  # Square tiles
  coord_fixed() +
  labs(
    title = "Correlation Matrix Heatmap",
    x = NULL,
    y = NULL
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = 12),
    plot.title = element_text(size = 16, face = "bold")
  )

Python Code (Correlation Matrix)

# ==============================
# Correlation heatmap (Python): pairwise correlations of four numeric
# columns, drawn with seaborn as an annotated heatmap.
# ==============================

# ---- 1. Libraries ----
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ---- 2. Read the CSV and keep the variables of interest ----
data = pd.read_csv("data_bisnis.csv")

# Columns that enter the correlation matrix; rows with a missing value in
# any of them are dropped.
numeric_columns = ['Quantity', 'Unit_Price', 'Discount', 'Feature_Interaction']
selected_vars = data[numeric_columns].dropna()

# ---- 3. Correlation matrix, rounded to two decimals ----
cor_matrix = selected_vars.corr().round(2)

# ---- 4. Heatmap ----
heatmap_options = dict(
    annot=True,          # write the correlation value in each cell
    fmt=".2f",           # two decimals
    cmap="YlGnBu",       # colour gradient
    vmin=-1, vmax=1,     # full correlation scale
    linewidths=0.5,      # thin separator between cells ...
    linecolor='white',   # ... drawn in white
)

plt.figure(figsize=(8, 6))
heatmap = sns.heatmap(cor_matrix, **heatmap_options)

# ---- 5. Titles and tick labels ----
plt.title("Correlation Matrix Heatmap", fontsize=16, fontweight='bold')
plt.xticks(rotation=45, ha='right')
## (array([0.5, 1.5, 2.5, 3.5]), [Text(0.5, 0, 'Quantity'), Text(1.5, 0, 'Unit_Price'), Text(2.5, 0, 'Discount'), Text(3.5, 0, 'Feature_Interaction')])
plt.yticks(rotation=0)
## (array([0.5, 1.5, 2.5, 3.5]), [Text(0, 0.5, 'Quantity'), Text(0, 1.5, 'Unit_Price'), Text(0, 2.5, 'Discount'), Text(0, 3.5, 'Feature_Interaction')])
plt.tight_layout()

# Render the figure
plt.show()

Time Series

Line Chart

R Code (Line Chart)

# ==============================
# Line chart (R): monthly sum of Total_Price per Region, one coloured
# line per Region, x axis labelled by year.
# ==============================

# ==============================
# 1. Load Required Libraries
# ==============================
library(ggplot2)
library(dplyr)
library(readr)
library(lubridate)
library(scales)

# ==============================
# 2. Load and Prepare Data
# ==============================
data <- read_csv("data_bisnis.csv")

# Convert the transaction date to Date and derive YearMonth, the first day
# of the month each transaction falls in (used as the monthly bucket).
data <- data %>%
  mutate(
    Transaction_Date = as.Date(Transaction_Date),
    YearMonth = floor_date(Transaction_Date, "month")
  )

# Aggregate total sales per month and region; missing Total_Price values
# are ignored in the sum, and the result is returned ungrouped.
monthly_region_sales <- data %>%
  group_by(YearMonth, Region) %>%
  summarise(Total_Sales = sum(Total_Price, na.rm = TRUE), .groups = "drop")

# ==============================
# 3. Plot Line Chart per Region (X: year labels, monthly data points)
# ==============================
ggplot(monthly_region_sales, aes(x = YearMonth, y = Total_Sales, color = Region)) +
  # Line thickness is set with `linewidth`; using `size` for lines is
  # deprecated since ggplot2 3.4.0 and triggers a deprecation warning.
  geom_line(linewidth = 1.2, alpha = 0.9) +
  # `size` is still the correct aesthetic for point markers
  geom_point(size = 2, alpha = 0.8) +
  labs(
    title = "Tren Total Penjualan Bulanan per Region",
    x = "Tahun",
    y = "Total Penjualan",
    color = "Region"
  ) +
  scale_x_date(
    date_labels = "%Y",            # Show the year only
    date_breaks = "1 year"         # One axis label per year
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 13, face = "italic", hjust = 0.5),
    axis.text.x = element_text(angle = 0, hjust = 0.5),
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 13),
    legend.text = element_text(size = 11)
  )

Python Code (Line Chart)

# ==============================
# Line chart (Python): monthly sum of Total_Price per Region, one line per
# Region, x axis labelled once per year.
# ==============================

# ==============================
# 1. Load Required Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ==============================
# 2. Load and Prepare Data
# ==============================
# Read the data
data = pd.read_csv("data_bisnis.csv")

# Parse the transaction date and derive YearMonth: the timestamp of the
# first day of the month each transaction falls in.
data['Transaction_Date'] = pd.to_datetime(data['Transaction_Date'])
data['YearMonth'] = data['Transaction_Date'].dt.to_period('M').dt.to_timestamp()

# Aggregate total sales per month and region
monthly_region_sales = data.groupby(['YearMonth', 'Region'])['Total_Price'].sum().reset_index()

# ==============================
# 3. Plot Line Chart per Region
# ==============================
plt.figure(figsize=(12, 6))
sns.lineplot(data=monthly_region_sales, x='YearMonth', y='Total_Price', hue='Region', marker='o')

# ==============================
# 4. Customize Aesthetics
# ==============================
plt.title("Monthly Total Sales by Region", fontsize=16, fontweight='bold')
plt.xlabel("Year")
plt.ylabel("Total Sales")
plt.legend(title='Region')
plt.grid(True)

# Format x-axis: show label every January only.
# The unique months are passed through pd.to_datetime() so that iteration
# always yields pandas Timestamps (which provide .month and .strftime),
# whether unique() hands back a NumPy datetime64 array or a pandas datetime
# array.  The January list is built once and reused for ticks and labels.
january_ticks = [
    d for d in pd.to_datetime(monthly_region_sales['YearMonth'].unique())
    if d.month == 1
]
plt.xticks(
    ticks=january_ticks,
    labels=[d.strftime('%Y') for d in january_ticks],
    rotation=0
)
## ([<matplotlib.axis.XTick object at 0x000001B442EE3A10>, <matplotlib.axis.XTick object at 0x000001B44310C350>, <matplotlib.axis.XTick object at 0x000001B443091D60>, <matplotlib.axis.XTick object at 0x000001B44F09A6F0>, <matplotlib.axis.XTick object at 0x000001B44F099A00>], [Text(18262.0, 0, '2020'), Text(18628.0, 0, '2021'), Text(18993.0, 0, '2022'), Text(19358.0, 0, '2023'), Text(19723.0, 0, '2024')])
plt.tight_layout()
plt.show()

Area Chart

R Code (Area Chart)

# ==============================
# Area chart (R): monthly sum of Total_Price per Region drawn as filled
# areas, one fill colour per Region.
# ==============================

# ==============================
# 1. Load Required Libraries
# ==============================
library(ggplot2)
library(dplyr)
library(readr)
library(lubridate)
library(scales)

# ==============================
# 2. Load and Prepare Data
# ==============================
# Read the data
data <- read_csv("data_bisnis.csv")

# Convert the transaction date to Date and derive YearMonth, the first day
# of the month each transaction falls in (used as the monthly bucket).
data <- data %>%
  mutate(
    Transaction_Date = as.Date(Transaction_Date),
    YearMonth = floor_date(Transaction_Date, "month")
  )

# Aggregate total sales per month and region; missing Total_Price values
# are ignored in the sum, and the result is returned ungrouped.
monthly_region_sales <- data %>%
  group_by(YearMonth, Region) %>%
  summarise(Total_Sales = sum(Total_Price, na.rm = TRUE), .groups = "drop")

# ==============================
# 3. Plot Area Chart
# ==============================
ggplot(monthly_region_sales, aes(x = YearMonth, y = Total_Sales, fill = Region)) +
  # White outline around each area.  Its thickness is set with `linewidth`;
  # using `size` for line widths is deprecated since ggplot2 3.4.0.
  geom_area(alpha = 0.8, linewidth = 0.5, colour = "white") +
  scale_fill_brewer(palette = "Set2") +
  # One x-axis label per year, showing the year only
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  labs(
    title = "Monthly Total Sales by Region (Area Chart)",
    x = "Year",
    y = "Total Sales",
    fill = "Region"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.title = element_text(size = 14),
    axis.text.x = element_text(size = 11, angle = 0, vjust = 0.5),
    axis.text.y = element_text(size = 12),
    legend.title = element_text(size = 13),
    legend.text = element_text(size = 11)
  )

# ==============================
# Python Code (Area Chart): monthly sum of Total_Price per Region drawn as
# a stacked area chart, one band per Region.
# ==============================
import pandas as pd
import matplotlib.pyplot as plt

# ==============================
# 1. Load and Prepare Data
# ==============================
data = pd.read_csv("data_bisnis.csv")
# Parse the transaction date and derive YearMonth: the timestamp of the
# first day of the month each transaction falls in.
data['Transaction_Date'] = pd.to_datetime(data['Transaction_Date'])
data['YearMonth'] = data['Transaction_Date'].dt.to_period('M').dt.to_timestamp()

# Aggregate total sales per month and region
monthly_sales = data.groupby(['YearMonth', 'Region'])['Total_Price'].sum().reset_index()

# Pivot to wide format (one row per month, one column per region) for the
# area chart; month/region combinations with no sales are filled with 0.
pivot_data = monthly_sales.pivot(index='YearMonth', columns='Region', values='Total_Price').fillna(0)

# ==============================
# 2. Plot Area Chart
# ==============================
# Create the figure once and draw into its axes.  Calling DataFrame.plot()
# without `ax=` makes pandas open a figure of its own, which would leave
# the figure created here empty and shown as a blank extra plot.
plt.figure(figsize=(12, 6))
pivot_data.plot(kind='area', stacked=True, ax=plt.gca(), alpha=0.7)

plt.title("Monthly Total Sales by Region (Area Chart)", fontsize=16, fontweight='bold')
plt.xlabel("Year")
plt.ylabel("Total Sales")
plt.xticks(rotation=45)
## (array([600, 612, 624, 636, 648]), [Text(600, 0, '2020'), Text(612, 0, '2021'), Text(624, 0, '2022'), Text(636, 0, '2023'), Text(648, 0, '2024')])
plt.tight_layout()
plt.grid(True)
plt.legend(title='Region')
plt.show()

