Bab 1 VISUALISASI DESKRIPTIF

## C:\Users\MOHAMM~1\AppData\Local\Programs\Python\PYTHON~1\python.exe

1.1 Bar Chart

1.1.1 R Code (Bar Chart)

# Load required libraries
library(dplyr)        # For data manipulation

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)      # For creating the bar chart

## Warning: package 'ggplot2' was built under R version 4.4.3

library(viridis)      # For color palette

## Warning: package 'viridis' was built under R version 4.4.3

## Loading required package: viridisLite

library(scales)       # For formatting currency labels

## 
## Attaching package: 'scales'

## The following object is masked from 'package:viridis':
## 
##     viridis_pal

# Step 1: Prepare the data
# Sum Total_Price within each Product_Category (missing prices are ignored)
# and rank the categories from the highest to the lowest total.
data_bisnis <- read.csv("data/bab8/data_bisnis.csv")
sales_summary <- data_bisnis %>%
  group_by(Product_Category) %>%
  summarise(Total_Sales = sum(Total_Price, na.rm = TRUE)) %>%
  arrange(desc(Total_Sales))

# Step 2: Generate a color palette
# One turbo colour per category, named by category. scale_fill_manual()
# matches a named vector by name, so the gradient follows the ranked bars;
# an unnamed vector would be handed out in alphabetical level order instead.
custom_colors <- setNames(
  viridis::turbo(n = nrow(sales_summary)),
  sales_summary$Product_Category
)

# Step 3: Create bar chart with value labels
# A single formatter keeps the bar labels and the y-axis labels identical.
rupiah_label <- scales::label_comma(prefix = "Rp ")

ggplot(sales_summary, aes(x = reorder(Product_Category, -Total_Sales),
                          y = Total_Sales,
                          fill = Product_Category)) +
  geom_col(show.legend = FALSE) +
  # vjust = -0.5 places each value label just above its bar
  geom_text(aes(label = rupiah_label(Total_Sales)),
            vjust = -0.5, size = 6) +
  scale_fill_manual(values = custom_colors) +
  # No padding below zero; 10% headroom above so the top label is not clipped
  scale_y_continuous(labels = rupiah_label,
                     expand = expansion(mult = c(0, 0.1))) +
  labs(
    title = "Total Sales by Product Category (2020–2024)",
    subtitle = "Based on Transaction Value",
    x = "Product Category",
    y = "Total Sales",
    caption = "@Mohammad Riyadh") +
  theme_minimal(base_size = 25)

1.1.2 Python Code (Bar Chart)

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from matplotlib import cm
import numpy as np

data_bisnis = pd.read_csv("data/bab8/data_bisnis.csv")


def _format_rupiah(value):
    """Render a number as 'Rp 1.234.567' (dot used as thousands separator)."""
    return f'Rp {int(value):,}'.replace(',', '.')


# Step 1: Total sales per product category, largest first
category_totals = data_bisnis.groupby('Product_Category', as_index=False).agg(
    Total_Sales=('Total_Price', 'sum')
)
sales_summary = category_totals.sort_values('Total_Sales', ascending=False)

# Step 2: One turbo colour per category, spread evenly over the colormap
num_categories = len(sales_summary)
colors = cm.turbo(np.linspace(0, 1, num_categories))

# Steps 3-4: Draw the bars on a fresh figure
fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(
    sales_summary['Product_Category'],
    sales_summary['Total_Sales'],
    color=colors,
    edgecolor='black',
    linewidth=0.8,
)

# Step 5: Currency-formatted y-axis with dashed horizontal grid lines
formatter = FuncFormatter(lambda tick, _pos: _format_rupiah(tick))
ax.yaxis.set_major_formatter(formatter)
ax.grid(axis='y', linestyle='--', alpha=0.6)

# Step 6: Axis labels and tick styling (category names slanted to avoid overlap)
ax.set_xlabel('Product Category', fontsize=14)
ax.set_ylabel('Total Sales', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=12);
ax.tick_params(axis='y', labelsize=12)

# Step 7: Value label above each bar, lifted by 1% of the tallest bar
label_offset = sales_summary['Total_Sales'].max() * 0.01
for bar in bars:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.text(
        bar_centre,
        bar_height + label_offset,
        _format_rupiah(bar_height),
        ha='center',
        va='bottom',
        fontsize=11,
    )

# Step 8: Figure-level title with the axes title acting as subtitle
fig.suptitle('Total Sales by Product Category (2020–2024)',
             fontsize=20, weight='bold', y=0.93)
ax.set_title('Based on Transaction Value', fontsize=16, pad=5, loc='center')

# Step 9: Credit line in the bottom-right corner of the figure
fig.text(0.98, 0.01, '@siregarbakti', ha='right', fontsize=16, color='gray')

# Step 10: Reserve room at the bottom for the credit and at the top for the title
plt.tight_layout(rect=[0, 0.03, 1, 0.92])

# Render the figure
plt.show();

1.2 Pie Chart

1.2.1 R Code (Pie Chart)

# Load necessary libraries
library(dplyr)      # For data manipulation
library(ggplot2)    # For data visualization
library(viridis)    # For color palettes
library(scales)     # For formatting percentages

# Step 1: Total sales per product category, ranked from highest to lowest
data_bisnis <- read.csv("data/bab8/data_bisnis.csv")

category_totals <- data_bisnis %>%
  group_by(Product_Category) %>%
  summarise(Total_Sales = sum(Total_Price, na.rm = TRUE)) %>%
  arrange(desc(Total_Sales))

# Add each category's share of the grand total and a two-line slice label
# ("<category>" on the first line, whole-number percentage on the second).
sales_summary <- category_totals %>%
  mutate(
    Percentage = Total_Sales / sum(Total_Sales),
    Label = paste0(
      Product_Category, "\n",
      scales::percent(Percentage, accuracy = 1)
    )
  )

# Step 2: One turbo colour per category
custom_colors <- viridis::turbo(n = nrow(sales_summary))

# Step 3: Plot donut chart
# Blank canvas theme with a centred title/subtitle and an italic caption.
donut_theme <- theme_void(base_size = 30) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5),
    plot.subtitle = element_text(margin = margin(t = 8, b = 20), hjust = 0.5),
    plot.caption = element_text(margin = margin(t = 15), hjust = 1.5,
                                color = "gray20", face = "italic")
  )

# A single stacked bar at x = 2 becomes a ring once the y axis is wrapped
# around a circle; the x range starting at 0.5 leaves the hole in the middle.
ggplot(sales_summary, aes(x = 2, y = Percentage, fill = Product_Category)) +
  geom_col(width = 1, color = "white", show.legend = FALSE) +
  coord_polar(theta = "y") +
  # Labels are centred inside their slices
  geom_text(aes(label = Label),
            position = position_stack(vjust = 0.5),
            size = 7, color = "white", fontface = "bold") +
  scale_fill_manual(values = custom_colors) +
  xlim(0.5, 2.5) +
  labs(
    title = "Sales Distribution by Product Category (2020–2024)",
    subtitle = "Based on Total Transaction Value",
    caption = "@Mohammad Riyadh"
  ) +
  donut_theme

1.2.2 Python Code (Pie Chart)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # required by read_csv below; this chunk did not import it
from matplotlib import cm

data_bisnis = pd.read_csv("data/bab8/data_bisnis.csv")

# Total sales per Product_Category, largest first
sales_summary = (
    data_bisnis
    .groupby('Product_Category', as_index=False)
    .agg(Total_Sales=('Total_Price', 'sum'))
    .sort_values('Total_Sales', ascending=False)
)

# Share of the grand total and a two-line label ("<category>\n<percent>")
sales_summary['Percentage'] = sales_summary['Total_Sales']/sales_summary['Total_Sales'].sum()
sales_summary['Label'] = sales_summary.apply(
    lambda row: f"{row['Product_Category']}\n{row['Percentage']:.0%}", axis=1
)

# Turbo colours, one per category.
# pyplot.get_cmap() replaces matplotlib.cm.get_cmap(), which was deprecated in
# Matplotlib 3.7 and is scheduled for removal.
num_categories = sales_summary.shape[0]
colors = plt.get_cmap('turbo')(np.linspace(0, 1, num_categories))

# Plot donut chart: a pie whose wedges are only half a radius wide
fig, ax = plt.subplots(figsize=(7, 7))
wedges, _ = ax.pie(
    sales_summary['Percentage'],
    labels=None,
    startangle=90,
    counterclock=False,
    colors=colors,
    wedgeprops=dict(width=0.5, edgecolor='white')
)

# Add a label to every wedge, placed at radius 0.7 along the wedge's
# mid-angle (theta1/theta2 are in degrees).
for wedge, label in zip(wedges, sales_summary['Label']):
    angle = (wedge.theta2 + wedge.theta1) / 2
    x = np.cos(np.radians(angle)) * 0.7
    y = np.sin(np.radians(angle)) * 0.7
    ax.text(x, y, label, ha='center', va='center', fontsize=10,
            color='white', weight='bold')

# Titles and credit line
fig.suptitle('Distribusi Penjualan per Kategori Produk (2020–2024)',
            fontsize=15, weight='bold')
ax.set_title('Berdasarkan Total Nilai Transaksi', fontsize=12, pad=10)
fig.text(0.98, 0.02, '@Mohammad Riyadh', ha='right',
        fontsize=12, color='gray', style='italic')

# Layout
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a perfect circle
plt.tight_layout()
plt.show();

1.3 Word Cloud

1.3.1 R Code (Word Cloud)

# ==============================
# 1. Install & Load Required Packages
# ==============================
# Install only the packages that are missing. system.file() returns "" for a
# package that is not installed, which avoids scanning the whole library with
# installed.packages() just to test a handful of names.
packages <- c("dplyr", "tm", "wordcloud", "RColorBrewer")
is_installed <- vapply(
  packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
new_packages <- packages[!is_installed]
if (length(new_packages) > 0) install.packages(new_packages)

library(dplyr)
library(tm)

## Warning: package 'tm' was built under R version 4.4.3

## Loading required package: NLP

## Warning: package 'NLP' was built under R version 4.4.2

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

library(wordcloud)

## Warning: package 'wordcloud' was built under R version 4.4.3

## Loading required package: RColorBrewer

library(RColorBrewer)

# ==============================
# 2. Read and Combine Text Columns
# ==============================
data_bisnis <- read.csv("data/bab8/data_bisnis.csv")

# One space-separated string per row, built from the three text columns
text_data <- paste(data_bisnis$Product_Category,
                   data_bisnis$Region,
                   data_bisnis$Sales_Channel,
                   sep = " ")

# ==============================
# 3. Clean and Prepare Text
# ==============================
# Each row becomes its own document in the corpus
corpus <- VCorpus(VectorSource(text_data))

corpus_clean <- corpus %>%
  tm_map(content_transformer(tolower)) %>%  # convert to lowercase
  tm_map(removePunctuation) %>%             # remove punctuation
  tm_map(removeNumbers) %>%                 # remove numbers
  tm_map(removeWords, stopwords("english")) %>%  # remove English stopwords
  tm_map(stripWhitespace)                   # collapse runs of whitespace

# Remove empty documents (if any).
# stripWhitespace collapses runs of whitespace to a single space rather than
# deleting them, so a document can still hold " " after cleaning; trim before
# testing for emptiness. vapply() guarantees a logical vector even when the
# corpus has no documents.
non_empty_idx <- vapply(corpus_clean, function(doc) {
  any(nzchar(trimws(content(doc))))
}, logical(1))
corpus_clean <- corpus_clean[non_empty_idx]

# ==============================
# 4. Create Term-Document Matrix & Word Frequencies
# ==============================
# Rows of the matrix are terms, so row sums are corpus-wide term counts
tdm <- TermDocumentMatrix(corpus_clean)
m <- as.matrix(tdm)
word_freqs <- sort(rowSums(m), decreasing = TRUE)
df_words <- data.frame(word = names(word_freqs), freq = word_freqs)

# ==============================
# 5. Generate Word Cloud (Full Screen)
# ==============================
set.seed(123)  # fixed seed so the word layout is reproducible
wordcloud(words = df_words$word,
          freq = df_words$freq,
          scale = c(4, 0.5),       # largest and smallest word size
          min.freq = 1,
          max.words = 300,
          random.order = FALSE,    # most frequent words are placed first
          rot.per = 0.3,           # proportion of words drawn rotated
          colors = brewer.pal(8, "Dark2"))

1.3.2 Python Code (Word Cloud)

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# 1. Fetch the NLTK stopword lists used for filtering below
nltk.download('stopwords')

# 2. Read the data and join the three text columns into one string per row
data_bisnis = pd.read_csv("data/bab8/data_bisnis.csv")

text_data = (
    data_bisnis['Product_Category'].fillna('')
    + " " + data_bisnis['Region'].fillna('')
    + " " + data_bisnis['Sales_Channel'].fillna('')
)

# 3. Clean the text (same steps as the tm_map pipeline in the R version)
stop_words = set(stopwords.words('english'))


def clean_text(text):
    """Lowercase, strip punctuation and digits, and drop English stopwords."""
    lowered = text.lower()
    no_punct = re.sub(r'[^\w\s]', ' ', lowered)
    no_digits = re.sub(r'\d+', '', no_punct)
    squeezed = re.sub(r'\s+', ' ', no_digits)
    kept = (tok for tok in squeezed.strip().split() if tok not in stop_words)
    return " ".join(kept)


cleaned_docs = text_data.apply(clean_text)

# Discard rows that ended up with no text at all
is_non_empty = cleaned_docs.str.strip() != ""
cleaned_docs = cleaned_docs[is_non_empty]

# 4. Document-term counts, summed per word across all documents
vectorizer = CountVectorizer()
tdm = vectorizer.fit_transform(cleaned_docs)

word_freqs = tdm.sum(axis=0).A1  # column totals flattened to a 1-D array
words = vectorizer.get_feature_names_out()

# Word/frequency table, most frequent first
df_words = (
    pd.DataFrame({'word': words, 'freq': word_freqs})
    .sort_values(by='freq', ascending=False)
)

# 5. Render the word cloud on a 16x9 inch figure
plt.figure(figsize=(16, 9))

wc = WordCloud(
    width=1200,
    height=900,
    background_color='white',
    max_words=300,
    min_font_size=8,
    random_state=123,
    prefer_horizontal=0.7,
    colormap='Dark2',
)

frequency_by_word = dict(zip(df_words['word'], df_words['freq']))
wc.generate_from_frequencies(frequency_by_word)

plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

1.4 Treemap

1.4.1 R Code (Treemap)

# ==============================
# 1. Install & Load Required Packages
# ==============================


# Load libraries
library(treemapify)

## Warning: package 'treemapify' was built under R version 4.4.3

library(ggplot2)
library(dplyr)

# ==============================
# 2. Prepare Aggregated Treemap Data
# ==============================
# One row per Product_Category x Region with its summed Total_Price.
data_bisnis <- read.csv("data/bab8/data_bisnis.csv")
tree_data <- data_bisnis %>%
  group_by(Product_Category, Region) %>%
  summarise(
    Total_Sales = sum(Total_Price, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    # format() with scientific = FALSE keeps large totals readable; pasting
    # the raw number can produce labels such as "1e+05".
    label_combined = paste0(
      Region, "\n",
      format(round(Total_Sales, 0), big.mark = ",",
             scientific = FALSE, trim = TRUE)
    )
  )

# ==============================
# 3. Create Static Tree Map with Combined Labels
# ==============================

ggplot(tree_data, aes(
  area = Total_Sales,
  fill = Product_Category,
  subgroup = Product_Category
)) +
  geom_treemap() +
  geom_treemap_subgroup_border(color = "white") +

  # Region name and total, centred in each tile
  geom_treemap_text(
    aes(label = label_combined),
    colour = "white",
    place = "centre",
    grow = FALSE,
    reflow = TRUE,
    size = 50 / .pt,       # Adjust overall font size
    min.size = 3
  ) +

  labs(
    title = "Tree Map of Total Sales by Product Category and Region"
  ) +
  theme_minimal()

1.4.2 Python Code (Treemap)

import pandas as pd
import matplotlib.pyplot as plt
import squarify
import matplotlib.patches as mpatches

# ==============================
# 1. Prepare Data
# ==============================

# Read every column as text, then convert Total_Price explicitly; values
# that cannot be parsed become NaN (errors='coerce').
data_bisnis = pd.read_csv("data/bab8/data_bisnis.csv", dtype=str)
data_bisnis['Total_Price'] = pd.to_numeric(data_bisnis['Total_Price'], errors='coerce')

# ==============================
# 2. Prepare Aggregated Treemap Data
# ==============================

# Aggregate sales by Product_Category and Region
tree_data = (
    data_bisnis
    .groupby(['Product_Category', 'Region'], as_index=False)
    .agg(Total_Sales=('Total_Price', 'sum'))
)

# Combined tile label: region on the first line, total on the second.
# The total is formatted as a whole number with thousands separators; printing
# the rounded float directly would leave a trailing ".0".
tree_data['label_combined'] = (
    tree_data['Region'] + "\n" + tree_data['Total_Sales'].map('{:,.0f}'.format)
)

# ==============================
# 3. Create Static Treemap with Legend
# ==============================

# Treemap values
sizes = tree_data['Total_Sales'].values
labels = tree_data['label_combined'].values
categories = tree_data['Product_Category'].values

# Qualitative Set3 palette: one colour per product category, shared by all
# of that category's region tiles.
unique_categories = tree_data['Product_Category'].unique()
palette = plt.get_cmap('Set3')
color_dict = {
    cat: palette(i / len(unique_categories)) for i, cat in enumerate(unique_categories)
}
colors = [color_dict[cat] for cat in categories]

# Create plot
fig, ax = plt.subplots(figsize=(16, 9))
squarify.plot(
    sizes=sizes,
    label=labels,
    color=colors,
    alpha=0.85,
    ax=ax,
    text_kwargs={'fontsize': 16, 'color': 'black'}
)

# Set title and remove axis
ax.set_title("Tree Map of Total Sales by Product Category and Region", fontsize=20)
ax.axis('off')

# ==============================
# 4. Add Legend
# ==============================

# One coloured patch per product category
legend_handles = [
    mpatches.Patch(color=color_dict[cat], label=cat) for cat in unique_categories
]

# Place legend outside the plot (right side)
plt.legend(
    handles=legend_handles,
    title='Product Category',
    bbox_to_anchor=(1.05, 1),
    loc='upper left'
)

# ==============================
# 5. Show Plot
# ==============================

plt.tight_layout()
plt.show();

1.5 Histogram

1.5.1 R Code (Histogram)

# ==============================
# 1. Load Required Libraries
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Prepare Data
# ==============================
# Read the data and make sure Quantity is stored as a number
data_bisnis <- read.csv("data/bab8/data_bisnis.csv") %>%
  mutate(Quantity = as.numeric(Quantity))

# ==============================
# 3. Create Histogram of Quantity with Custom Font Sizes
# ==============================
# Font sizes for the title, both axis titles and both sets of tick labels
histogram_fonts <- theme(
  plot.title = element_text(size = 30, face = "bold"),
  axis.title = element_text(size = 25),
  axis.text = element_text(size = 20)
)

# One bar per unit of Quantity (binwidth = 1)
ggplot(data_bisnis, aes(x = Quantity)) +
  geom_histogram(
    binwidth = 1,
    fill = "skyblue",
    color = "gray",
    alpha = 0.7
  ) +
  labs(
    title = "Histogram of Quantity Distribution",
    x = "Quantity",
    y = "Frequency"
  ) +
  theme_minimal() +
  histogram_fonts

1.5.2 Python Code (Histogram)

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ==============================
# 1. Prepare Data
# ==============================
# Read every column as text, convert Quantity to a number (unparseable
# values become NaN) and keep only the rows where Quantity is present.
data_bisnis = pd.read_csv("data/bab8/data_bisnis.csv", dtype=str)
data_bisnis['Quantity'] = pd.to_numeric(data_bisnis['Quantity'], errors='coerce')

data_clean = data_bisnis.dropna(subset=['Quantity'])

# ==============================
# 2. Plot Histogram with Custom Font Sizes
# ==============================
fig, ax = plt.subplots(figsize=(16, 9))

# One bar per unit of Quantity
sns.histplot(
    data_clean['Quantity'],
    binwidth=1,
    color='skyblue',
    alpha=0.7,
    edgecolor='gray',
    ax=ax,
)

ax.set_title('Histogram of Quantity Distribution', fontsize=30, fontweight='bold')
ax.set_xlabel('Quantity', fontsize=25)
ax.set_ylabel('Frequency', fontsize=25)

# Tick label sizes on both axes
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

plt.tight_layout()
plt.show();

1.6 Density Plot

1.6.1 R Code (Density Plot)

# ==============================
# 1. Load Required Libraries
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Prepare Data
# ==============================
# Read the data, coerce Quantity to numeric and keep only non-missing rows
data_bisnis <- read.csv("data/bab8/data_bisnis.csv") %>%
  mutate(Quantity = as.numeric(Quantity)) %>%
  filter(!is.na(Quantity))

# Mean of Quantity, marked on the plot by a vertical line
mean_quantity <- mean(data_bisnis$Quantity, na.rm = TRUE)

# Peak of the estimated density curve; used to choose the label height
density_data <- density(data_bisnis$Quantity)
max_y <- max(density_data$y)

# Text and vertical position (80% of the peak) of the mean label
mean_label <- paste("Mean =", round(mean_quantity, 2))
label_height <- max_y * 0.8

# ==============================
# 3. Create Density Plot with Mean Line and Label
# ==============================
ggplot(data_bisnis, aes(x = Quantity)) +
  geom_density(fill = "skyblue", alpha = 0.6) +
  # Dashed red line at the mean
  geom_vline(
    xintercept = mean_quantity,
    color = "red",
    linetype = "dashed",
    linewidth = 1
  ) +
  # Rotated label running alongside the mean line
  ggplot2::annotate(
    "text",
    x = mean_quantity,
    y = label_height,
    label = mean_label,
    color = "black",
    angle = 90,
    vjust = -0.5,
    size = 8,
    fontface = "bold"
  ) +
  labs(
    title = "Density Plot of Quantity with Mean",
    x = "Quantity",
    y = "Density"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 35, face = "bold"),
    axis.title = element_text(size = 30),
    axis.text = element_text(size = 25)
  )

1.6.2 Python Code (Density Plot)

# ==============================
# 1. Load Required Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import gaussian_kde

# ==============================
# 2. Prepare Data
# ==============================
# Read every column as text, convert Quantity to a number (unparseable
# values become NaN) and drop the rows where it is missing.
data_bisnis = pd.read_csv("data/bab8/data_bisnis.csv", dtype=str)
data_bisnis['Quantity'] = pd.to_numeric(data_bisnis['Quantity'], errors='coerce')
data_bisnis = data_bisnis.dropna(subset=['Quantity'])

mean_quantity = data_bisnis['Quantity'].mean()

# ==============================
# 3. Calculate density manually (to get y max for label)
# ==============================
# Evaluate a Gaussian KDE on 1000 evenly spaced points across the observed
# range and take its peak; the label height is derived from that peak.
values = data_bisnis['Quantity'].values
density = gaussian_kde(values)
x_vals = np.linspace(values.min(), values.max(), 1000)
y_vals = density(x_vals)
max_density_y = y_vals.max()

# ==============================
# 4. Plot density, mean line, and text label
# ==============================
fig, ax = plt.subplots(figsize=(16, 9))

# Filled density curve
sns.kdeplot(data=data_bisnis, x='Quantity', fill=True, color='skyblue',
            alpha=0.6, ax=ax)

# Dashed red line at the mean
ax.axvline(mean_quantity, color='red', linestyle='--', linewidth=1)

# Rotated label next to the mean line, at 80% of the density peak
label_height = max_density_y * 0.8
ax.text(
    mean_quantity,
    label_height,
    f'Mean = {mean_quantity:.2f}',
    rotation=90,
    verticalalignment='bottom',
    horizontalalignment='right',
    color='black',
    fontsize=12,
    fontweight='bold',
)

# Title and axis labels
ax.set_title("Density Plot of Quantity with Mean", fontsize=30, fontweight='bold')
ax.set_xlabel("Quantity", fontsize=25)
ax.set_ylabel("Density", fontsize=25)

# Tick label sizes
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

plt.show();

1.7 Boxplot

1.7.1 R Code (Boxplot)

# ==============================
# 1. Load Libraries
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Load and Prepare Data
# ==============================
data_bisnis <- read.csv("data/bab8/data_bisnis.csv", stringsAsFactors = FALSE)

# Convert Quantity to numeric and drop rows where it is missing
data_bisnis <- data_bisnis %>%
  mutate(Quantity = as.numeric(Quantity)) %>%
  filter(!is.na(Quantity))

# IQR-based outlier bounds: 1.5 * IQR below Q1 and above Q3
Q1 <- quantile(data_bisnis$Quantity, 0.25)
Q3 <- quantile(data_bisnis$Quantity, 0.75)
IQR_value <- IQR(data_bisnis$Quantity)
lower_whisker <- Q1 - 1.5 * IQR_value
upper_whisker <- Q3 + 1.5 * IQR_value

# ==============================
# 3. Summarize Statistics
# ==============================
# One-row summary; Outliers counts the values outside the two bounds
stats <- data_bisnis %>%
  summarise(
    Mean = mean(Quantity),
    Q1 = Q1,
    Median = median(Quantity),
    Q3 = Q3,
    Min = min(Quantity),
    Max = max(Quantity),
    Outliers = sum(Quantity < lower_whisker | Quantity > upper_whisker)
  )

# ==============================
# 4. Basic Boxplot with Jitter and Annotations
# ==============================
# Builds one text label "<name>: <value>" to the right of the box at height y.
# annotate() is called with its namespace so that a same-named function from
# another attached package cannot be picked up instead.
stat_label <- function(y, name, colour, face = "plain") {
  ggplot2::annotate("text", x = 1.2, y = y,
                    label = paste(paste0(name, ":"), round(y, 2)),
                    hjust = 0, color = colour, fontface = face)
}

ggplot(data_bisnis, aes(x = factor(1), y = Quantity)) +
  # Basic boxplot (its own outlier markers are hidden; the points below show them)
  geom_boxplot(fill = "skyblue", outlier.shape = NA) +

  # Jittered points, outliers in red. height = 0 restricts the jitter to the
  # horizontal direction; by default geom_jitter() also moves points
  # vertically, which would display Quantity values that are not in the data.
  geom_jitter(aes(color = Quantity < lower_whisker | Quantity > upper_whisker),
              width = 0.1, height = 0, size = 2, alpha = 0.5) +
  scale_color_manual(values = c("FALSE" = "black", "TRUE" = "red"), guide = "none") +

  # Highlight max point if not an outlier
  geom_point(data = data_bisnis %>% filter(Quantity == stats$Max[[1]] & Quantity <= upper_whisker),
             aes(x = factor(1), y = Quantity),
             color = "red", size = 20) +

  # Annotations for the summary statistics
  stat_label(stats$Mean[[1]], "Mean", "blue", face = "bold") +
  stat_label(stats$Q1[[1]], "Q1", "darkgreen") +
  stat_label(stats$Median[[1]], "Median", "purple") +
  stat_label(stats$Q3[[1]], "Q3", "darkgreen") +
  stat_label(stats$Min[[1]], "Min", "orange") +
  stat_label(stats$Max[[1]], "Max", "orange") +
  # Outlier count, centred 5% above the maximum
  ggplot2::annotate("text", x = 1, y = stats$Max[[1]] + 0.05 * stats$Max[[1]],
           label = paste("Outliers:", stats$Outliers[[1]]),
           color = "red", fontface = "italic", hjust = 0.5) +

  # Plot formatting
  labs(
    title = "Boxplot of Quantity with Jitter",
    x = NULL,
    y = "Quantity"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.title = element_text(size = 50, face = "bold"),
    axis.title = element_text(size = 40),
    axis.text = element_text(size = 30)
  )

1.7.2 Python Code (Boxplot)

# ==============================
# 1. Load Libraries
# ==============================
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# ==============================
# 2. Load and Prepare Data
# ==============================
# Convert Quantity to a number (unparseable values become NaN) and drop the
# rows where it is missing.
data_bisnis = pd.read_csv("data/bab8/data_bisnis.csv")
data_bisnis['Quantity'] = pd.to_numeric(data_bisnis['Quantity'], errors='coerce')
data_bisnis = data_bisnis.dropna(subset=['Quantity'])

# IQR-based outlier bounds: 1.5 * IQR below Q1 and above Q3
Q1 = data_bisnis['Quantity'].quantile(0.25)
Q3 = data_bisnis['Quantity'].quantile(0.75)
IQR = Q3 - Q1
lower_whisker = Q1 - 1.5 * IQR
upper_whisker = Q3 + 1.5 * IQR

# Boolean flag per row: True when Quantity lies outside the bounds
below_lower = data_bisnis['Quantity'] < lower_whisker
above_upper = data_bisnis['Quantity'] > upper_whisker
data_bisnis['Outlier'] = below_lower | above_upper

# ==============================
# 3. Calculate Summary Statistics
# ==============================
mean_val = data_bisnis['Quantity'].mean()
median_val = data_bisnis['Quantity'].median()
min_val = data_bisnis['Quantity'].min()
max_val = data_bisnis['Quantity'].max()
outliers = data_bisnis['Outlier'].sum()

# ==============================
# 4. Plot: Boxplot with Jitter + Annotations
# ==============================
plt.figure(figsize=(14, 10))

# Every row shares one empty category label, giving a single box at x = 0
single_category = [""] * len(data_bisnis)

# Box without its own outlier markers; the strip plot below draws the points
sns.boxplot(
    y=data_bisnis['Quantity'],
    x=single_category,
    width=0.4,
    color='skyblue',
    showfliers=False,
)

# Jittered raw points: black for regular values, red for outliers
sns.stripplot(
    y=data_bisnis['Quantity'],
    x=single_category,
    hue=data_bisnis['Outlier'],
    palette={False: 'black', True: 'red'},
    dodge=False,
    jitter=0.2,
    size=5,
    alpha=0.5,
)

# Replace the automatic hue legend with an empty, frameless one
plt.legend([], [], frameon=False)

# Large red marker on the maximum when it is not an outlier
max_is_regular = max_val <= upper_whisker
if max_is_regular:
    plt.scatter(0, max_val, color='red', s=300, zorder=10)

# ==============================
# 5. Annotations
# ==============================
def annotate_stat(y, label, color, weight='normal'):
    """Write '<label>: <y rounded to 2 dp>' to the right of the box at height y."""
    plt.text(
        x=0.2,
        y=y,
        s=f"{label}: {round(y, 2)}",
        color=color,
        fontsize=14,
        fontweight=weight,
        ha='left',
    )


# (value, label, colour, font weight) for each statistic shown beside the box
stat_annotations = [
    (mean_val, "Mean", "blue", "bold"),
    (Q1, "Q1", "darkgreen", "normal"),
    (median_val, "Median", "purple", "normal"),
    (Q3, "Q3", "darkgreen", "normal"),
    (min_val, "Min", "orange", "normal"),
    (max_val, "Max", "orange", "normal"),
]
for stat_value, stat_name, stat_color, stat_weight in stat_annotations:
    annotate_stat(stat_value, stat_name, stat_color, stat_weight)

# Outlier count, centred 5% above the maximum
plt.text(
    x=0,
    y=max_val + 0.05 * max_val,
    s=f"Outliers: {outliers}",
    color="red",
    fontstyle="italic",
    fontsize=16,
    ha='center',
)

# ==============================
# 6. Final Formatting
# ==============================
plt.title("Boxplot of Quantity with Jitter", fontsize=32, fontweight='bold')
plt.xlabel("")
plt.ylabel("Quantity", fontsize=24)
plt.xticks([])
plt.yticks(fontsize=14)
plt.tight_layout()
plt.show()

1.8 Violin Plot

1.8.1 R Code (Violin Plot)

# ==============================
# 1. Load Libraries
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Load and Prepare Data
# ==============================
data_bisnis <- read.csv("data/bab8/data_bisnis.csv", stringsAsFactors = FALSE)

# Convert Quantity to numeric and drop rows where it is missing
data_bisnis <- data_bisnis %>%
  mutate(Quantity = as.numeric(Quantity)) %>%
  filter(!is.na(Quantity))

# IQR-based outlier bounds: 1.5 * IQR below Q1 and above Q3
Q1 <- quantile(data_bisnis$Quantity, 0.25)
Q3 <- quantile(data_bisnis$Quantity, 0.75)
IQR_value <- IQR(data_bisnis$Quantity)
upper_whisker <- Q3 + 1.5 * IQR_value
lower_whisker <- Q1 - 1.5 * IQR_value

# Label every row as "Outlier" or "Normal"; used for point colour and legend
data_bisnis <- data_bisnis %>%
  mutate(
    is_outlier = ifelse(Quantity < lower_whisker | Quantity > upper_whisker, "Outlier", "Normal")
  )

# ==============================
# 3. Summarize Statistics
# ==============================
# One-row summary that feeds the text annotations below
stats <- data_bisnis %>%
  summarise(
    Mean = mean(Quantity),
    Q1 = Q1,
    Median = median(Quantity),
    Q3 = Q3,
    Min = min(Quantity),
    Max = max(Quantity),
    Outliers = sum(is_outlier == "Outlier")
  )

# ==============================
# 4. Create Violin Plot with Colored Jitter and Annotations
# ==============================
ggplot(data_bisnis, aes(x = factor(1), y = Quantity)) +
  geom_violin(fill = "skyblue", trim = FALSE) +
  # Narrow boxplot inside the violin, without its own outlier markers
  geom_boxplot(width = 0.1, outlier.shape = NA, color = "black") +
  # height = 0 restricts the jitter to the horizontal direction; by default
  # geom_jitter() also moves points vertically, which would display Quantity
  # values that are not in the data.
  geom_jitter(aes(color = is_outlier), width = 0.1, height = 0,
              alpha = 0.6, size = 2) +
  # Large red marker on the maximum when it is not an outlier
  geom_point(data = data_bisnis %>%
               filter(Quantity == stats$Max[[1]] & Quantity <= upper_whisker),
             aes(x = factor(1), y = Quantity),
             color = "red", size = 8) +

  # Annotations via geom_text, one label per statistic to the right of the violin
  geom_text(data = stats, aes(x = 1.2, y = Mean, label = paste("Mean:", round(Mean, 2))),
            hjust = 0, color = "blue", fontface = "bold") +
  geom_text(data = stats, aes(x = 1.2, y = Q1, label = paste("Q1:", round(Q1, 2))),
            hjust = 0, color = "darkgreen") +
  geom_text(data = stats, aes(x = 1.2, y = Median, label = paste("Median:", round(Median, 2))),
            hjust = 0, color = "purple") +
  geom_text(data = stats, aes(x = 1.2, y = Q3, label = paste("Q3:", round(Q3, 2))),
            hjust = 0, color = "darkgreen") +
  geom_text(data = stats, aes(x = 1.2, y = Min, label = paste("Min:", round(Min, 2))),
            hjust = 0, color = "orange") +
  geom_text(data = stats, aes(x = 1.2, y = Max, label = paste("Max:", round(Max, 2))),
            hjust = 0, color = "orange") +
  # Outlier count, centred 5% above the maximum
  geom_text(data = stats, aes(x = 1, y = Max + 0.05 * Max,
                             label = paste("Outliers:", Outliers)),
            color = "red", fontface = "italic", hjust = 0.5) +

  scale_color_manual(values = c("Normal" = "black", "Outlier" = "red")) +

  labs(
    title = "Violin Plot of Quantity with Outlier Highlighted",
    x = NULL,
    y = "Quantity",
    color = "Point Type"
  ) +
  # A single complete theme; a second theme_minimal() call would simply be
  # replaced by this one.
  theme_minimal(base_size = 15) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.title = element_text(size = 30, face = "bold"),
    axis.title = element_text(size = 20),
    axis.text = element_text(size = 20),
    legend.position = "right",
    legend.title = element_text(size = 20),
    legend.text = element_text(size = 15)
  )

1.8.2 Python Code (Violin Plot)

# ==============================
# 1. Load Libraries
# ==============================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# ==============================
# 2. Load and Prepare Data
# ==============================
# Read the transactions, coerce Quantity to numeric (unparseable values
# become NaN) and drop the rows where Quantity is missing.
data_bisnis = (
    pd.read_csv("data/bab8/data_bisnis.csv")
    .assign(Quantity=lambda frame: pd.to_numeric(frame['Quantity'], errors='coerce'))
    .dropna(subset=['Quantity'])
)
quantity = data_bisnis['Quantity']

# Quartiles and the 1.5 * IQR fences used to classify outliers
Q1, Q3 = quantity.quantile(0.25), quantity.quantile(0.75)
IQR = Q3 - Q1
lower_whisker, upper_whisker = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Tag every row as 'Outlier' (outside the fences) or 'Normal'
beyond_fences = (quantity < lower_whisker) | (quantity > upper_whisker)
data_bisnis['is_outlier'] = np.where(beyond_fences, 'Outlier', 'Normal')

# ==============================
# 3. Compute Summary Statistics
# ==============================
# Values that are later written next to the violin as text labels
stats = dict(
    Mean=quantity.mean(),
    Q1=Q1,
    Median=quantity.median(),
    Q3=Q3,
    Min=quantity.min(),
    Max=quantity.max(),
    Outliers=(data_bisnis['is_outlier'] == 'Outlier').sum(),
)

# ==============================
# 4. Create Violin Plot + Jitter + Boxplot + Annotations
# ==============================
plt.figure(figsize=(12, 10))

# All layers share the same y values and a single dummy x category
quantity_values = data_bisnis['Quantity']
single_category = [""] * len(data_bisnis)

# Layer 1: violin outline without any inner markings
sns.violinplot(x=single_category, y=quantity_values, inner=None, color='skyblue')

# Layer 2: narrow, unfilled boxplot; fliers are hidden here
sns.boxplot(
    x=single_category,
    y=quantity_values,
    width=0.1,
    showcaps=True,
    showfliers=False,
    boxprops={'facecolor':'none', 'edgecolor':'black'},
    whiskerprops={'color':'black'},
    medianprops={'color':'black'},
)

# Layer 3: jittered raw points, coloured by outlier status
sns.stripplot(
    x=single_category,
    y=quantity_values,
    hue=data_bisnis['is_outlier'],
    palette={'Normal': 'black', 'Outlier': 'red'},
    jitter=True,
    dodge=False,
    size=5,
    alpha=0.6
)

# Draw one large red dot at Max when Max does not exceed the upper fence
max_within_fence = stats['Max'] <= upper_whisker
if max_within_fence:
    plt.scatter(0, stats['Max'], color='red', s=200, zorder=10)

# ==============================
# 5. Annotations
# ==============================
def add_text(label, y, color, bold=False, italic=False, text=None):
    """Write one annotation to the right of the violin (x = 0.2) at height ``y``.

    Parameters
    ----------
    label : str
        Name of the statistic; rendered as ``"<label>: <y rounded to 2>"``
        when ``y`` is numeric and ``text`` is not given.
    y : number
        Vertical data coordinate of the annotation.
    color : str
        Matplotlib colour of the text.
    bold, italic : bool
        Font weight / style switches.
    text : str, optional
        Exact string to draw. Use it when the displayed value is not ``y``
        itself (for example a count placed at an arbitrary height).
    """
    if text is None:
        # NumPy scalars are included explicitly: np.int64 is not a subclass of
        # the built-in int, so a plain (int, float) check would drop the value.
        is_number = isinstance(y, (int, float, np.integer, np.floating))
        text = f"{label}: {round(y, 2)}" if is_number else label
    plt.text(
        x=0.2,
        y=y,
        s=text,
        color=color,
        fontsize=13,
        fontweight='bold' if bold else 'normal',
        fontstyle='italic' if italic else 'normal',
        ha='left'
    )

add_text("Mean", stats['Mean'], "blue", bold=True)
add_text("Q1", stats['Q1'], "darkgreen")
add_text("Median", stats['Median'], "purple")
add_text("Q3", stats['Q3'], "darkgreen")
add_text("Min", stats['Min'], "orange")
add_text("Max", stats['Max'], "orange")
# The outlier count sits 5% above Max; pass the final string explicitly so the
# y position is not appended to the label.
add_text(
    "Outliers",
    stats['Max'] + 0.05 * stats['Max'],
    "red",
    italic=True,
    text=f"Outliers: {stats['Outliers']}"
)

# ==============================
# 6. Final Layout
# ==============================
# Title, axis labels and legend are set on the current axes
ax = plt.gca()
ax.set_title("Violin Plot of Quantity with Outlier Highlighted", fontsize=22, fontweight='bold')
ax.set_xlabel("")
ax.set_ylabel("Quantity", fontsize=16)
plt.xticks([])           # single dummy category: no x ticks
plt.yticks(fontsize=12)
ax.legend(title="Point Type", title_fontsize=14, fontsize=12, loc="upper right")
plt.tight_layout()
plt.show()

1.9 Grouped Bar Chart

1.9.1 R Code (Grouped Bar Chart)

# ==============================
# 1. Load Libraries
# ==============================
library(ggplot2)
library(dplyr)
library(readr)

## Warning: package 'readr' was built under R version 4.4.2

## 
## Attaching package: 'readr'

## The following object is masked from 'package:scales':
## 
##     col_factor

# ==============================
# 2. Load Data
# ==============================
# Read the transaction table with readr (column types are guessed).
data_bisnis <- readr::read_csv(file = "data/bab8/data_bisnis.csv")

## New names:
## • `` -> `...1`

## Rows: 500 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): Transaction_ID, Customer_ID, Product_Category, Product_ID, Region...
## dbl  (15): ...1, Quantity, Unit_Price, Discount, Delivery_Time, Total_Price,...
## lgl   (1): ID_HasPattern
## date  (1): Transaction_Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# ==============================
# 3. Data Summarization
# ==============================
# One row per Product_Category x Region with the summed Total_Price
# (missing values are ignored).
sales_summary <- data_bisnis %>%
  group_by(Product_Category, Region) %>%
  summarise(
    Total_Sales = sum(Total_Price, na.rm = TRUE),
    .groups = "drop"
  )

# ==============================
# 4. Plot Grouped Bar Chart
# ==============================
ggplot(
  data = sales_summary,
  mapping = aes(x = Product_Category, y = Total_Sales, fill = Region)
) +
  # Bars use the pre-computed totals and sit side by side per Region
  geom_col(position = position_dodge(width = 0.8)) +
  # Black baseline at y = 0
  geom_hline(yintercept = 0, color = "black") +

  # ==============================
  # 5. Customization
  # ==============================
  scale_y_continuous(labels = scales::comma) +
  guides(fill = guide_legend(title.position = "top")) +
  labs(
    title = "Total Sales by Product Category and Region",
    x = "Product Category",
    y = "Total Sales (USD)",
    fill = "Region"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 18),
    axis.title = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 12),
    axis.text.y = element_text(size = 12),
    legend.title = element_text(size = 13),
    legend.text = element_text(size = 12)
  )

1.9.2 Python Code (Grouped Bar Chart)

# ==============================
# 1. Load Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ==============================
# 2. Load Data
# ==============================
data_bisnis = pd.read_csv("data/bab8/data_bisnis.csv")

# ==============================
# 3. Data Summarization
# ==============================
# Sum Total_Price for every (Product_Category, Region) pair
sales_summary = (
    data_bisnis
    .groupby(['Product_Category', 'Region'])['Total_Price']
    .sum()
    .reset_index(name='Total_Sales')
)

# ==============================
# 4. Plot Grouped Bar Chart
# ==============================
plt.figure(figsize=(12, 8))
ax = sns.barplot(
    x='Product_Category',
    y='Total_Sales',
    hue='Region',
    data=sales_summary,
    dodge=True
)

# ==============================
# 5. Customization
# ==============================
ax.set_title("Total Sales by Product Category and Region", fontsize=18, fontweight='bold')
ax.set_xlabel("Product Category", fontsize=14)
ax.set_ylabel("Total Sales (USD)", fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(fontsize=12)
ax.legend(title="Region", fontsize=12, title_fontsize=13)
plt.tight_layout()
ax.grid(axis='y', linestyle='--', alpha=0.7)  # dashed horizontal guide lines
plt.show()

1.10 Ridgeline Plot

1.10.1 R Code (Ridgeline Plot)

# ==============================
# 1. Load Libraries
# ==============================
library(ggplot2)
library(dplyr)
library(ggridges)

## Warning: package 'ggridges' was built under R version 4.4.3

library(readr)
library(scales)

# ==============================
# 2. Load and Filter Data
# ==============================
# Read the transaction table with readr (column types are guessed).
data_bisnis <- readr::read_csv(file = "data/bab8/data_bisnis.csv")

## New names:
## Rows: 500 Columns: 25
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (8): Transaction_ID, Customer_ID, Product_Category, Product_ID, Region... dbl
## (15): ...1, Quantity, Unit_Price, Discount, Delivery_Time, Total_Price,... lgl
## (1): ID_HasPattern date (1): Transaction_Date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

# Keep only rows with a usable Price_per_Unit.
# is.finite() is FALSE for NA, NaN and +/-Inf, so one test covers all three.
data_bisnis_filtered <- data_bisnis %>%
  filter(is.finite(Price_per_Unit))

# ==============================
# 3. Create Ridgeline Plot
# ==============================
# One density ridge per Region; the x axis is formatted as "Rp" with "." as
# thousands separator and "," as decimal mark.
ggplot(
  data = data_bisnis_filtered,
  mapping = aes(x = Price_per_Unit, y = Region, fill = Region)
) +
  geom_density_ridges(scale = 1.2, alpha = 0.7, bandwidth = 1.2) +
  scale_x_continuous(
    labels = label_number(prefix = "Rp", big.mark = ".", decimal.mark = ",")
  ) +
  labs(
    title = "Distribution of Price per Unit by Region",
    x = "Price per Unit",
    y = NULL
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 24, face = "bold", hjust = 0.5),
    axis.title.x = element_text(size = 18),
    axis.text = element_text(size = 12),
    strip.text = element_text(size = 14, face = "bold")
  )

1.10.2 Python Code (Ridgeline Plot)

# ==============================
# 1. Load Libraries
# ==============================
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import numpy as np

# ==============================
# 2. Load and Filter Data
# ==============================
data_bisnis = pd.read_csv("data/bab8/data_bisnis.csv")

# Keep only rows whose 'Price_per_Unit' is a finite number
# (np.isfinite is False for NaN and +/-Inf, so no separate null check is needed)
data_bisnis_filtered = data_bisnis[np.isfinite(data_bisnis["Price_per_Unit"])]

# ==============================
# 3. Create Ridgeline Plot
# ==============================
# Set style
sns.set(style="whitegrid", rc={"axes.titlesize":30, "axes.labelsize":20})

# One facet row per Region; the rows are overlapped further below
g = sns.FacetGrid(
    data_bisnis_filtered,
    row="Region",
    hue="Region",
    aspect=4,
    height=1.2,
    palette="Set2"
)

g.map(sns.kdeplot, "Price_per_Unit", bw_adjust=1.2, fill=True, alpha=0.7)

for ax in g.axes.flat:
    # Format x ticks as "Rp" with "." as thousands separator
    ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f"Rp{x:,.0f}".replace(",", ".")))
    # Transparent background so an overlapping row does not hide the density
    # drawn in the row above it
    ax.patch.set_alpha(0)

g.set_titles(row_template="{row_name}", size=16, weight='bold')
g.set_axis_labels("Price per Unit", None)
g.set(yticks=[])

plt.suptitle("Distribution of Price per Unit by Region", fontsize=24, weight='bold')

# tight_layout recomputes the subplot spacing (including hspace), so it has to
# run BEFORE the negative hspace is applied; otherwise the overlap is discarded.
plt.tight_layout(rect=[0, 0, 1, 0.95])
g.fig.subplots_adjust(hspace=-0.5)
plt.show()

1.11 Boxplot by Category

1.11.1 R Code (Boxplot by Category)

# ==============================
# 1. Load Required Libraries
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Prepare Data
# ==============================
# Read the CSV, coerce Quantity to numeric and drop rows where it is NA.
data_bisnis <- read.csv("data/bab8/data_bisnis.csv", stringsAsFactors = FALSE) %>%
  mutate(Quantity = as.numeric(Quantity)) %>%
  filter(!is.na(Quantity))

# ==============================
# 3. Create Boxplot
# ==============================
# One box per Product_Category; outliers are drawn as red filled circles.
ggplot(
  data = data_bisnis,
  mapping = aes(x = Product_Category, y = Quantity, fill = Product_Category)
) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 16, outlier.size = 2) +
  labs(
    title = "Boxplot of Quantity by Product Category",
    x = "Product Category",
    y = "Quantity"
  ) +
  # A complete theme replaces any earlier one, so a single call is enough
  theme_minimal(base_size = 40) +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 30, face = "bold"),
    axis.title = element_text(size = 25),
    axis.text = element_text(size = 20)
  )

1.11.2 Python Code (Boxplot by Category)

# ==============================
# 1. Load Required Libraries
# ==============================
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ==============================
# 2. Prepare Data
# ==============================
# Read the CSV, coerce Quantity to numeric and drop rows where it is missing
data_bisnis = pd.read_csv("data/bab8/data_bisnis.csv")
data_bisnis['Quantity'] = pd.to_numeric(data_bisnis['Quantity'], errors='coerce')
data_bisnis = data_bisnis.dropna(subset=['Quantity'])

# ==============================
# 3. Create Boxplot
# ==============================
plt.figure(figsize=(20, 12))  # large canvas (the R version uses base_size = 40)
sns.boxplot(
    data=data_bisnis,
    x='Product_Category',
    y='Quantity',
    palette='Set3',
    fliersize=6,  # outlier marker size
    # `color` only affects the (invisible) connecting line of the flier
    # artist; the marker face and edge colours must be set explicitly for the
    # outlier dots to be drawn in red.
    flierprops=dict(
        marker='o',
        markerfacecolor='red',
        markeredgecolor='red',
        markersize=6
    )
)

# Title and axis labels
plt.title("Boxplot of Quantity by Product Category", fontsize=30, weight='bold')
plt.xlabel("Product Category", fontsize=25)
plt.ylabel("Quantity", fontsize=25)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

# Replace any legend with an empty, frameless one
plt.legend([],[], frameon=False)

# Show the plot
plt.tight_layout()
plt.show()

1.12 Lollipop Chart

1.12.1 R Code (Lollipop Chart )

# ==============================
# 1. Load Required Libraries
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Prepare Data
# ==============================
data_bisnis <- read.csv("data/bab8/data_bisnis.csv", stringsAsFactors = FALSE)

# Summarize total sales by Product_Category and Region, then turn
# Product_Category into a factor ordered by Total_Sales. Doing the reordering
# once in the data guarantees that every layer uses the same y ordering.
sales_grouped <- data_bisnis %>%
  group_by(Product_Category, Region) %>%
  summarise(Total_Sales = sum(Total_Price, na.rm = TRUE), .groups = "drop") %>%
  mutate(Product_Category = reorder(Product_Category, Total_Sales))

# ==============================
# 3. Grouped Lollipop Chart
# ==============================
ggplot(sales_grouped, aes(x = Total_Sales, y = Product_Category, color = Region)) +
  # Stem from 0 to the total; y is inherited so stems and heads share one scale.
  # `linewidth` replaces the `size` aesthetic deprecated for lines in
  # ggplot2 3.4.0.
  geom_segment(aes(x = 0, xend = Total_Sales, yend = Product_Category), linewidth = 5) +
  # Lollipop head
  geom_point(size = 5) +
  labs(
    title = "Grouped Lollipop Chart",
    x = "Total Sales",
    y = "Product Category"
  ) +
  theme_minimal(base_size = 20) +
  theme(
    axis.text = element_text(size = 20),
    axis.title = element_text(size = 20),
    plot.title = element_text(size = 10, face = "bold")
  )

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

1.12.2 Python Code (Lollipop Chart)

# ==============================
# 1. Load Required Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ==============================
# 2. Prepare Data
# ==============================
# Load dataset
data_bisnis = pd.read_csv("data/bab8/data_bisnis.csv")

# Total sales per (Product_Category, Region)
sales_grouped = (
    data_bisnis
    .groupby(['Product_Category', 'Region'])['Total_Price']
    .sum()
    .reset_index(name='Total_Sales')
)

# ==============================
# 3. Grouped Lollipop Chart
# ==============================
# Set plot style and size
sns.set(style="whitegrid")
plt.figure(figsize=(16, 10))

# One "tab10" colour per region, in order of first appearance
regions = sales_grouped['Region'].unique()
region_colors = dict(zip(
    regions,
    sns.color_palette("tab10", n_colors=sales_grouped['Region'].nunique())
))

# Draw the stems and heads region by region
for region in regions:
    region_rows = (
        sales_grouped
        .loc[sales_grouped['Region'] == region]
        .sort_values("Total_Sales")
    )
    region_color = region_colors[region]

    # Stem: horizontal line from 0 to the total (carries the legend label)
    plt.hlines(
        y=region_rows['Product_Category'],
        xmin=0,
        xmax=region_rows['Total_Sales'],
        color=region_color,
        linewidth=5,
        label=region
    )
    # Head: round marker at the total
    plt.plot(
        region_rows['Total_Sales'],
        region_rows['Product_Category'],
        'o',
        markersize=10,
        color=region_color
    )

# Plot customisation
plt.title("Grouped Lollipop Chart", fontsize=20, fontweight='bold')
plt.xlabel("Total Sales", fontsize=16)
plt.ylabel("Product Category", fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.legend(title="Region", fontsize=12, title_fontsize=14)
plt.tight_layout()
plt.show()

1.13 Heatmap

1.13.1 R Code (Heatmap)

# ==============================
# 1. Load Libraries
# ==============================
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.3

## Warning: package 'tibble' was built under R version 4.4.3

## Warning: package 'forcats' was built under R version 4.4.3

## Warning: package 'lubridate' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ NLP::annotate()     masks ggplot2::annotate()
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# ==============================
# 2. Load and Prepare Data
# ==============================
# Read the transaction table with readr (column types are guessed).
data_bisnis <- readr::read_csv(file = "data/bab8/data_bisnis.csv")

## New names:
## Rows: 500 Columns: 25
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (8): Transaction_ID, Customer_ID, Product_Category, Product_ID, Region... dbl
## (15): ...1, Quantity, Unit_Price, Discount, Delivery_Time, Total_Price,... lgl
## (1): ID_HasPattern date (1): Transaction_Date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

# ==============================
# 3. Average Total_Price per Product_Category x Region
# ==============================
# Mean Total_Price (NA ignored) for every category/region pair.
# `.groups = "drop"` returns an ungrouped table directly, which avoids the
# "summarise() has grouped output" message and the separate ungroup() step.
avg_price_data <- data_bisnis %>%
  group_by(Product_Category, Region) %>%
  summarise(
    Average_Total_Price = mean(Total_Price, na.rm = TRUE),
    .groups = "drop"
  )

## `summarise()` has grouped output by 'Product_Category'. You can override using
## the `.groups` argument.

# ==============================
# 4. Plot Heatmap
# ==============================
# Tiles are shaded white -> orangered by the average and labelled with the
# average rounded to a whole number.
ggplot(
  data = avg_price_data,
  mapping = aes(x = Region, y = Product_Category, fill = Average_Total_Price)
) +
  geom_tile(color = "gray", linewidth = 0.5) +
  geom_text(aes(label = round(Average_Total_Price, 0)), color = "black") +
  scale_fill_gradient(low = "white", high = "orangered") +
  theme_minimal(base_size = 14) +
  labs(
    title = "Average Total Price per Product Category and Region",
    x = "Region",
    y = "Product Category",
    fill = "Avg Total Price"
  )

1.13.2 Python Code (Heatmap)

# ==============================
# 1. Load Libraries
# ==============================
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ==============================
# 2. Load and Prepare Data
# ==============================
# Load CSV file
data_bisnis = pd.read_csv("data/bab8/data_bisnis.csv")

# ==============================
# 3. Pivot Table of Average Total_Price
# ==============================
# Rows = Product_Category, columns = Region, cells = mean Total_Price
avg_price_data = data_bisnis.pivot_table(
    values="Total_Price",
    index="Product_Category",
    columns="Region",
    aggfunc="mean"
)

# ==============================
# 4. Plot Heatmap
# ==============================
plt.figure(figsize=(12, 8))
ax = sns.heatmap(
    avg_price_data,
    annot=True,        # print the value inside each cell
    fmt=".0f",         # ...rounded to a whole number
    cmap="OrRd",
    linewidths=0.5,
    linecolor="gray"
)

ax.set_title("Average Total Price per Product Category and Region", fontsize=16)
ax.set_xlabel("Region")
ax.set_ylabel("Product Category")
plt.tight_layout()
plt.show()

1.14 Scatter Plot

1.14.1 R Code (Scatter Plot)

# ==============================
# 1. Load Required Libraries
# ==============================
library(tidyverse)

# ==============================
# 2. Prepare Data
# ==============================
# Read the transaction table with readr (column types are guessed).
data_bisnis <- readr::read_csv(file = "data/bab8/data_bisnis.csv")

## New names:
## Rows: 500 Columns: 25
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (8): Transaction_ID, Customer_ID, Product_Category, Product_ID, Region... dbl
## (15): ...1, Quantity, Unit_Price, Discount, Delivery_Time, Total_Price,... lgl
## (1): ID_HasPattern date (1): Transaction_Date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

# Compute Total_Price as Quantity * Unit_Price * (1 - Discount).
# NOTE(review): the readr column specification printed for this file already
# lists a Total_Price column, so this assignment overwrites it unconditionally
# -- confirm that recomputing is intended.
data_bisnis <- mutate(
  data_bisnis,
  Total_Price = Quantity * Unit_Price * (1 - Discount)
)

# ==============================
# 3. Scatter Plot
# ==============================
# Unit Price (x) against Total Price (y), one colour per Region
ggplot(
  data = data_bisnis,
  mapping = aes(x = Unit_Price, y = Total_Price, color = Region)
) +
  geom_point(alpha = 0.7) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(size = 18, face = "bold"),
    legend.title = element_text(size = 12)
  ) +
  labs(
    title = "Scatter Plot: Unit Price vs Total Price",
    x = "Unit Price",
    y = "Total Price",
    color = "Region"
  )

1.14.2 Python Code (Scatter Plot)

# ==============================
# 1. Load Required Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ==============================
# 2. Prepare Data
# ==============================
# Load dataset
data_bisnis = pd.read_csv("data/bab8/data_bisnis.csv")

# Compute Total_Price as Quantity * Unit_Price * (1 - Discount).
# NOTE(review): other sections read Total_Price straight from this CSV, so this
# assignment overwrites an existing column -- confirm that is intended.
discount_factor = 1 - data_bisnis['Discount']
data_bisnis['Total_Price'] = data_bisnis['Quantity'] * data_bisnis['Unit_Price'] * discount_factor

# ==============================
# 3. Scatter plot
# ==============================
# Set plot size and style
plt.figure(figsize=(12, 8))
sns.set(style="whitegrid")

# Unit Price (x) against Total Price (y), one colour per Region
ax = sns.scatterplot(
    x='Unit_Price',
    y='Total_Price',
    hue='Region',
    data=data_bisnis,
    alpha=0.7
)

# Labels and title
ax.set_title('Scatter Plot: Unit Price vs Total Price', fontsize=18, fontweight='bold')
ax.set_xlabel('Unit Price', fontsize=14)
ax.set_ylabel('Total Price', fontsize=14)
ax.legend(title='Region')
ax.grid(True)
plt.tight_layout()
plt.show()

1.15 Bubble Chart

1.15.1 R Code (Bubble Chart)

# ==============================
# 1. Load Required Libraries
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Prepare Data
# ==============================
# Read the CSV and compute Total_Price as
# Quantity * Unit_Price * (1 - Discount).
data_bisnis <- read.csv("data/bab8/data_bisnis.csv", stringsAsFactors = FALSE) %>%
  mutate(Total_Price = Quantity * Unit_Price * (1 - Discount))

# ==============================
# 3. Bubble Chart with Region
# ==============================
# Bubble area encodes Quantity (point sizes scaled to 3-15), colour encodes Region
ggplot(
  data = data_bisnis,
  mapping = aes(
    x = Unit_Price,
    y = Total_Price,
    size = Quantity,
    color = Region
  )
) +
  geom_point(alpha = 0.6) +
  scale_size(range = c(3, 15)) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "right",
    plot.title = element_text(face = "bold", size = 18)
  ) +
  labs(
    title = "Bubble Chart: Unit Price vs Total Price (Size = Quantity, Color = Region)",
    x = "Unit Price",
    y = "Total Price",
    size = "Quantity",
    color = "Region"
  )

1.15.2 Python Code (Bubble Chart)

# ==============================
# 1. Load Required Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ==============================
# 2. Prepare Data
# ==============================
# Load dataset
data_bisnis = pd.read_csv("data/bab8/data_bisnis.csv")

# Compute Total_Price as Quantity * Unit_Price * (1 - Discount).
# NOTE(review): other sections read Total_Price straight from this CSV, so this
# assignment overwrites an existing column -- confirm that is intended.
discount_factor = 1 - data_bisnis['Discount']
data_bisnis['Total_Price'] = data_bisnis['Quantity'] * data_bisnis['Unit_Price'] * discount_factor

# ==============================
# 3. Bubble Chart with Region
# ==============================
plt.figure(figsize=(14, 10))
sns.set(style="whitegrid")

# Colour (hue) follows Region, marker size follows Quantity
ax = sns.scatterplot(
    x='Unit_Price',
    y='Total_Price',
    hue='Region',
    size='Quantity',
    sizes=(50, 1000),
    data=data_bisnis,
    alpha=0.6,
    edgecolor='grey'
)

# Title and labels
ax.set_title('Bubble Chart: Unit Price vs Total Price (Size = Quantity, Color = Region)', fontsize=18, fontweight='bold')
ax.set_xlabel('Unit Price', fontsize=14)
ax.set_ylabel('Total Price', fontsize=14)
# Legend placed outside the axes, anchored at the upper-left corner
ax.legend(title='Region', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

1.16 Correlation Matrix

1.16.1 R Code (Correlation Matrix)

# ==============================
# 1. Load Required Libraries
# ==============================
library(ggplot2)
library(dplyr)
library(corrplot)

## Warning: package 'corrplot' was built under R version 4.4.2

## corrplot 0.95 loaded

# ==============================
# 2. Prepare Data
# ==============================
# Read the CSV and compute Total_Price as
# Quantity * Unit_Price * (1 - Discount).
data_bisnis <- read.csv("data/bab8/data_bisnis.csv", stringsAsFactors = FALSE) %>%
  mutate(Total_Price = Quantity * Unit_Price * (1 - Discount))

# Columns that enter the correlation matrix
numeric_columns <- c("Quantity", "Unit_Price", "Discount", "Total_Price")
numerical_data <- data_bisnis[, numeric_columns]

# ==============================
# 3. Correlation Matrix
# ==============================
# Correlations computed on rows without any missing value
cor_matrix <- cor(numerical_data, use = "complete.obs")

# Upper-triangle colour map with the coefficients printed in each cell
corrplot::corrplot(
  cor_matrix,
  method = "color",
  type = "upper",
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 45,
  title = "Correlation Matrix of Numerical Variables",
  mar = c(0, 0, 2, 0)
)

1.16.2 Python Code (Correlation Matrix)

# ==============================
# 1. Load Required Libraries
# ==============================
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ==============================
# 2. Prepare Data
# ==============================
# Load dataset
data_bisnis = pd.read_csv("data/bab8/data_bisnis.csv")

# Compute Total_Price as Quantity * Unit_Price * (1 - Discount).
# NOTE(review): other sections read Total_Price straight from this CSV, so this
# assignment overwrites an existing column -- confirm that is intended.
discount_factor = 1 - data_bisnis['Discount']
data_bisnis['Total_Price'] = data_bisnis['Quantity'] * data_bisnis['Unit_Price'] * discount_factor

# Numeric columns that enter the correlation matrix
numeric_columns = ['Quantity', 'Unit_Price', 'Discount', 'Total_Price']
numerical_data = data_bisnis[numeric_columns]

# ==============================
# 3. Correlation Matrix
# ==============================
# Pairwise correlations (pandas default method)
correlation_matrix = numerical_data.corr()

# Annotated heatmap with square cells
plt.figure(figsize=(8, 6))
ax = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    square=True
)
ax.set_title('Correlation Matrix of Numerical Variables', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

1.17 Line Chart

1.17.1 R Code (Line Chart)

# ==============================
# 1. Load Required Libraries
# ==============================
library(ggplot2)
library(dplyr)
library(lubridate)

# ==============================
# 2. Prepare Data
# ==============================
data_bisnis <- read.csv("data/bab8/data_bisnis.csv", stringsAsFactors = FALSE)

# Parse the transaction date
data_bisnis$Transaction_Date <- as.Date(data_bisnis$Transaction_Date)

# Compute Total_Price, truncate each date to the first day of its month and
# sum the sales per Region and Month. Note that data_bisnis is replaced by
# this monthly summary.
data_bisnis <- data_bisnis %>%
  mutate(
    Total_Price = Quantity * Unit_Price * (1 - Discount),
    Month = floor_date(Transaction_Date, "month")
  ) %>%
  group_by(Region, Month) %>%
  summarise(Monthly_Sales = sum(Total_Price, na.rm = TRUE), .groups = "drop")

# ==============================
# 3. Line Chart per Region
# ==============================
ggplot(data_bisnis, aes(x = Month, y = Monthly_Sales, color = Region)) +
  # `linewidth` replaces the `size` aesthetic, which is deprecated for lines
  # since ggplot2 3.4.0
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  labs(
    title = "Monthly Sales by Region",
    x = "Month",
    y = "Total Sales",
    color = "Region"
  ) +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(face = "bold", size = 18))

1.17.2 Python Code (Line Chart)

# ==============================
# 1. Load Required Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ==============================
# 2. Prepare Data
# ==============================
# Load dataset
data_bisnis = pd.read_csv("data/bab8/data_bisnis.csv")

# Parse the transaction date column as datetime
data_bisnis['Transaction_Date'] = pd.to_datetime(data_bisnis['Transaction_Date'])

# Compute Total_Price as Quantity * Unit_Price * (1 - Discount)
discount_factor = 1 - data_bisnis['Discount']
data_bisnis['Total_Price'] = data_bisnis['Quantity'] * data_bisnis['Unit_Price'] * discount_factor

# Month column: each date mapped to the timestamp of its monthly period
data_bisnis['Month'] = data_bisnis['Transaction_Date'].dt.to_period('M').dt.to_timestamp()

# Sum of Total_Price per (Region, Month)
monthly_sales_region = (
    data_bisnis
    .groupby(['Region', 'Month'], as_index=False)['Total_Price']
    .sum()
)

# ==============================
# 3. Line Chart per Region
# ==============================
plt.figure(figsize=(14, 6))
ax = sns.lineplot(
    x='Month',
    y='Total_Price',
    hue='Region',
    data=monthly_sales_region,
    marker='o'
)

ax.set_title('Monthly Sales by Region', fontsize=18, fontweight='bold')
ax.set_xlabel('Month', fontsize=14)
ax.set_ylabel('Total Sales', fontsize=14)
ax.legend(title='Region', fontsize=12)
ax.grid(True)
plt.tight_layout()
plt.show()

1.18 Area Chart

1.18.1 R Code (Area Chart)

# ==============================
# 1. Load Required Libraries
# ==============================
library(ggplot2)
library(dplyr)
library(lubridate)

# ==============================
# 2. Prepare Data
# ==============================
data_bisnis <- read.csv("data/bab8/data_bisnis.csv", stringsAsFactors = FALSE)

# Parse the transaction date
data_bisnis$Transaction_Date <- as.Date(data_bisnis$Transaction_Date)

# Add Total_Price and the month (date truncated to the first of the month)
sales_with_month <- data_bisnis %>%
  mutate(
    Total_Price = Quantity * Unit_Price * (1 - Discount),
    Month = floor_date(Transaction_Date, "month")
  )

# Replace data_bisnis by the monthly sales total per Region
data_bisnis <- sales_with_month %>%
  group_by(Region, Month) %>%
  summarise(Monthly_Sales = sum(Total_Price, na.rm = TRUE), .groups = "drop")

# ==============================
# 3. Area Chart per Region
# ==============================
# Regions are stacked on top of each other; legend goes below the plot
ggplot(
  data = data_bisnis,
  mapping = aes(x = Month, y = Monthly_Sales, fill = Region)
) +
  geom_area(position = "stack", alpha = 0.6) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold", size = 18)
  ) +
  labs(
    title = "Monthly Sales by Region (Area Chart)",
    x = "Month",
    y = "Total Sales",
    fill = "Region"
  )

1.18.2 Python Code (Area Chart)

# ==============================
# 1. Load Required Libraries
# ==============================
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ==============================
# 2. Prepare Data
# ==============================
# Load dataset
data_bisnis = pd.read_csv("data/bab8/data_bisnis.csv")

# Make sure the date column is in datetime format
data_bisnis['Transaction_Date'] = pd.to_datetime(data_bisnis['Transaction_Date'])

# Compute Total_Price: quantity x unit price, scaled by (1 - Discount)
data_bisnis['Total_Price'] = data_bisnis['Quantity'] * data_bisnis['Unit_Price'] * (1 - data_bisnis['Discount'])

# Create the month column (each date mapped to the timestamp of its month period)
data_bisnis['Month'] = data_bisnis['Transaction_Date'].dt.to_period('M').dt.to_timestamp()

# Aggregate per Region and Month
monthly_region_sales = data_bisnis.groupby(['Region', 'Month'])['Total_Price'].sum().reset_index()

# Pivot to wide form (one column per Region) for the area chart;
# Region/Month combinations without any rows are filled with 0
pivot_data = monthly_region_sales.pivot(index='Month', columns='Region', values='Total_Price').fillna(0)

# ==============================
# 3. Area Chart per Region
# ==============================
# Create the figure and axes once and draw onto them. Calling plt.figure() and
# then DataFrame.plot.area() without `ax` makes pandas open a second figure,
# which leaves the first one empty and shows it as a blank plot.
fig, ax = plt.subplots(figsize=(14, 6))
pivot_data.plot.area(ax=ax, cmap="tab20", alpha=0.6)

ax.set_title('Monthly Sales by Region (Area Chart)', fontsize=18, fontweight='bold')
ax.set_xlabel('Month', fontsize=14)
ax.set_ylabel('Total Sales', fontsize=14)
ax.grid(True)
# Anchor the legend outside the plotting area, to the right of the axes
ax.legend(title='Region', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

Data Science Programming

Bab 1 VISUALISASI DESKRIPTIF

1.1 Bar Chart

1.1.1 R Code (Bar Chart)

1.1.2 Python Code (Bar Chart)

1.2 Pie Chart

1.2.1 R Code (Pie Chart)

1.2.2 Python Code (Pie Chart)

1.3 Word Cloud

1.3.1 R Code (Word Cloud)

1.3.2 Python Code (Word Cloud)

1.4 Treemap

1.4.1 R Code (Treemap)

1.4.2 Python Code (Treemap)

1.5 Histogram

1.5.1 R Code (Histogram)

1.5.2 Python Code (Histogram)

1.6 Density Plot

1.6.1 R Code (Density Plot)

1.6.2 Python Code (Density Plot)

1.7 Boxplot

1.7.1 R Code (Boxplot)

1.7.2 Python Code (Boxplot)

1.8 Violin Plot

1.8.1 R Code (Violin Plot)

1.8.2 Python Code (Violin Plot)

1.9 Grouped Bar Chart

1.9.1 R Code (Grouped Bar Chart)

1.9.2 Python Code (Grouped Bar Chart)

1.10 Ridgeline Plot

1.10.1 R Code (Ridgeline Plot)

1.10.2 Python Code (Ridgeline Plot)

1.11 Boxplot by Category

1.11.1 R Code (Boxplot by Category)

1.11.2 Python Code (Boxplot by Category)

1.12 Lollipop Chart

1.12.1 R Code (Lollipop Chart)

1.12.2 Python Code (Lollipop Chart)

1.13 Heatmap

1.13.1 R Code (Heatmap)

1.13.2 Python Code (Heatmap)

1.14 Scatter Plot

1.14.1 R Code (Scatter Plot)

1.14.2 Python Code (Scatter Plot)

1.15 Bubble Chart

1.15.1 R Code (Bubble Chart)

1.15.2 Python Code (Bubble Chart)

1.16 Correlation Matrix

1.16.1 R Code (Correlation Matrix)

1.16.2 Python Code (Correlation Matrix)

1.17 Line Chart

1.17.1 R Code (Line Chart)

1.17.2 Python Code (Line Chart)

1.18 Area Chart

1.18.1 R Code (Area Chart)

1.18.2 Python Code (Area Chart)