Tugas Individu
Tugas Data Science Programming 13
Data set
1 Categorical Data
1.1 Bar Chart
# ---- Packages ----
library(dplyr)    # grouping and aggregation verbs
library(ggplot2)  # plotting
library(viridis)  # turbo colour palette
library(scales)   # number formatting for labels

# ---- 1. Total transaction value per product category, largest first ----
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv")
sales_summary <- data_bisnis %>%
  group_by(Product_Category) %>%
  summarise(Total_Sales = sum(Total_Price, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(Total_Sales))

# ---- 2. One turbo colour per category ----
custom_colors <- viridis::turbo(n = nrow(sales_summary))

# ---- 3. Bar chart with a value label above each bar ----
# The same formatter is used for the bar labels and the y-axis ticks.
format_rupiah <- scales::label_comma(prefix = "Rp ")

ggplot(
  sales_summary,
  aes(
    x = reorder(Product_Category, -Total_Sales),  # tallest bar on the left
    y = Total_Sales,
    fill = Product_Category
  )
) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = format_rupiah(Total_Sales)), vjust = -0.3, size = 3) +
  scale_fill_manual(values = custom_colors) +
  scale_y_continuous(
    labels = format_rupiah,
    expand = expansion(mult = c(0, 0.1))  # no gap below bars, 10% headroom above
  ) +
  labs(
    title = "Total Sales by Product Category (2020–2024)",
    subtitle = "Based on Transaction Value",
    x = "Product Category",
    y = "Total Sales",
    caption = "@siregarbakti"
  ) +
  theme_minimal(base_size = 10)
1.2 Pie Chart
# ---- Packages ----
library(dplyr)    # grouping and aggregation verbs
library(ggplot2)  # plotting
library(viridis)  # turbo colour palette
library(scales)   # percentage formatting

# ---- 1. Sales share per product category ----
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv")
sales_summary <- data_bisnis %>%
  group_by(Product_Category) %>%
  summarise(Total_Sales = sum(Total_Price, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(Total_Sales)) %>%
  # Share of the grand total for each category.
  mutate(Percentage = Total_Sales / sum(Total_Sales)) %>%
  # Two-line slice label: category name, then the rounded percentage.
  mutate(
    Label = paste0(
      Product_Category, "\n",
      scales::percent(Percentage, accuracy = 1)
    )
  )

# ---- 2. One turbo colour per category ----
custom_colors <- viridis::turbo(n = nrow(sales_summary))

# ---- 3. Donut chart ----
# Text styling is kept in its own object so the plot pipeline stays short.
donut_text_theme <- theme(
  plot.title = element_text(face = "bold", hjust = 0.5),
  plot.subtitle = element_text(margin = margin(t = 8, b = 20), hjust = 0.5),
  plot.caption = element_text(
    margin = margin(t = 15), hjust = 1.5,
    color = "gray20", face = "italic"
  )
)

ggplot(sales_summary, aes(x = 2, y = Percentage, fill = Product_Category)) +
  geom_col(width = 1, colour = "white", show.legend = FALSE) +
  geom_text(
    aes(label = Label),
    position = position_stack(vjust = 0.5),  # centre each label in its slice
    size = 3, colour = "white", fontface = "bold"
  ) +
  coord_polar(theta = "y") +
  scale_fill_manual(values = custom_colors) +
  # The bars span x = 1.5..2.5; starting the axis at 0.5 leaves the donut hole.
  scale_x_continuous(limits = c(0.5, 2.5)) +
  labs(
    title = "Sales Distribution by Product Category (2020–2024)",
    subtitle = "Based on Total Transaction Value",
    caption = "@siregarbakti"
  ) +
  theme_void(base_size = 10) +
  donut_text_theme
1.3 Word Cloud
# ==============================
# 1. Install & Load Required Packages
# ==============================
packages <- c("dplyr", "tm", "wordcloud", "RColorBrewer")
# system.file() returns "" for a package that is not installed; this avoids
# building the full installed.packages() table just to test four names.
is_installed <- vapply(
  packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
new_packages <- packages[!is_installed]
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

library(dplyr)
library(tm)
library(wordcloud)
library(RColorBrewer)

# ==============================
# 2. Read and Combine Text Columns
# ==============================
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv")

# One text string per row: category, region and sales channel joined by spaces.
text_data <- paste(
  data_bisnis$Product_Category,
  data_bisnis$Region,
  data_bisnis$Sales_Channel,
  sep = " "
)

# ==============================
# 3. Clean and Prepare Text
# ==============================
corpus <- VCorpus(VectorSource(text_data))

corpus_clean <- corpus %>%
  tm_map(content_transformer(tolower)) %>%       # convert to lowercase
  tm_map(removePunctuation) %>%                  # remove punctuation
  tm_map(removeNumbers) %>%                      # remove numbers
  tm_map(removeWords, stopwords("english")) %>%  # remove English stopwords
  tm_map(stripWhitespace)                        # collapse repeated whitespace

# Drop documents that are empty or whitespace-only after cleaning.
# vapply() always yields a logical vector, even for an empty corpus.
non_empty_idx <- vapply(
  corpus_clean,
  function(doc) any(nzchar(trimws(content(doc)))),
  logical(1)
)
corpus_clean <- corpus_clean[non_empty_idx]

# ==============================
# 4. Create Term-Document Matrix & Word Frequencies
# ==============================
tdm <- TermDocumentMatrix(corpus_clean)
m <- as.matrix(tdm)
word_freqs <- sort(rowSums(m), decreasing = TRUE)
df_words <- data.frame(word = names(word_freqs), freq = word_freqs)

# ==============================
# 5. Generate Word Cloud
# ==============================
set.seed(123)  # fixed seed so the layout is reproducible
wordcloud(
  words = df_words$word,
  freq = df_words$freq,
  # `scale` is the (largest, smallest) word size. The previous c(4, 4) drew
  # every word at the same size, so frequency was not visible.
  scale = c(4, 0.8),
  min.freq = 1,
  max.words = 300,
  random.order = FALSE,  # most frequent words are placed first, in the centre
  rot.per = 0.3,
  colors = brewer.pal(8, "Dark2")
)
1.4 Treemap
# ==============================
# 1. Install missing packages, then attach them
# ==============================
packages <- c("treemapify", "dplyr", "ggplot2")
new_packages <- setdiff(packages, rownames(installed.packages()))
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

library(treemapify)
library(ggplot2)
library(dplyr)

# ==============================
# 2. Total sales per category/region pair
# ==============================
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv")
tree_data <- data_bisnis %>%
  group_by(Product_Category, Region) %>%
  summarise(Total_Sales = sum(Total_Price, na.rm = TRUE), .groups = "drop") %>%
  # Tile label: region on the first line, rounded total on the second.
  mutate(label_combined = paste(Region, round(Total_Sales), sep = "\n"))

# ==============================
# 3. Static treemap: tiles sized by sales, grouped and coloured by category
# ==============================
treemap_mapping <- aes(
  area = Total_Sales,
  fill = Product_Category,
  subgroup = Product_Category
)

ggplot(tree_data, treemap_mapping) +
  geom_treemap() +
  geom_treemap_subgroup_border(colour = "white") +
  geom_treemap_text(
    aes(label = label_combined),
    colour = "white",
    place = "centre",
    grow = FALSE,
    reflow = TRUE,
    size = 20 / .pt,  # overall label font size
    min.size = 3
  ) +
  ggtitle("Tree Map of Total Sales by Product Category and Region") +
  theme_minimal()
2 Numerical Data
2.1 Histogram
# ==============================
# 1. Packages
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Data: read the CSV and make sure Quantity is numeric
# ==============================
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv") %>%
  mutate(Quantity = as.numeric(Quantity))

# ==============================
# 3. Histogram of Quantity, one bin per unit
# ==============================
# Shared text styles for both axes.
axis_title_style <- element_text(size = 10)
axis_tick_style <- element_text(size = 9)

ggplot(data_bisnis, aes(x = Quantity)) +
  geom_histogram(
    binwidth = 1,
    fill = "skyblue",
    colour = "gray",
    alpha = 0.7
  ) +
  ggtitle("Histogram of Quantity Distribution") +
  xlab("Quantity") +
  ylab("Frequency") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 15, face = "bold"),
    axis.title.x = axis_title_style,
    axis.title.y = axis_title_style,
    axis.text.x = axis_tick_style,
    axis.text.y = axis_tick_style
  )
2.2 Density Plot
# ==============================
# 1. Packages
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Data
# ==============================
# Read the CSV, coerce Quantity to numeric and drop rows where it is missing.
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv") %>%
  mutate(Quantity = as.numeric(Quantity)) %>%
  filter(!is.na(Quantity))

# Mean of Quantity: position of the reference line and text of its label.
mean_quantity <- mean(data_bisnis$Quantity, na.rm = TRUE)

# Peak of the kernel density estimate, used to place the label vertically.
density_data <- density(data_bisnis$Quantity)
max_y <- max(density_data$y)

# ==============================
# 3. Density plot with a dashed mean line and a rotated label
# ==============================
ggplot(data_bisnis, aes(x = Quantity)) +
  geom_density(fill = "skyblue", alpha = 0.6) +
  geom_vline(
    xintercept = mean_quantity,
    colour = "red",
    linetype = "dashed",
    linewidth = 1
  ) +
  # ggplot2:: is spelled out because other attached packages may mask annotate().
  ggplot2::annotate(
    "text",
    x = mean_quantity,
    y = max_y * 0.8,  # 80% of the peak height
    label = paste("Mean =", round(mean_quantity, 2)),
    colour = "black",
    angle = 90,
    vjust = -0.5,
    size = 3,  # small label size for readability
    fontface = "bold"
  ) +
  labs(
    title = "Density Plot of Quantity with Mean",
    x = "Quantity",
    y = "Density"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),  # reduced from 25
    axis.title = element_text(size = 12),                 # reduced from 20
    axis.text = element_text(size = 10)                   # reduced from 15
  )
2.3 Box Plot
# ==============================
# 1. Packages
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Data and outlier fences
# ==============================
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv", stringsAsFactors = FALSE)

# Coerce Quantity to numeric and drop rows where it is missing.
data_bisnis <- data_bisnis %>%
  mutate(Quantity = as.numeric(Quantity)) %>%
  filter(!is.na(Quantity))

# Quartiles, then fences at 1.5 * IQR beyond them.
quartiles <- quantile(data_bisnis$Quantity, c(0.25, 0.75))
Q1 <- quartiles[1]
Q3 <- quartiles[2]
IQR_value <- IQR(data_bisnis$Quantity)
lower_whisker <- Q1 - 1.5 * IQR_value
upper_whisker <- Q3 + 1.5 * IQR_value

# ==============================
# 3. One-row summary of the statistics shown on the plot
# ==============================
stats <- data_bisnis %>%
  summarise(
    Mean = mean(Quantity),
    Q1 = Q1,
    Median = median(Quantity),
    Q3 = Q3,
    Min = min(Quantity),
    Max = max(Quantity),
    Outliers = sum(Quantity < lower_whisker | Quantity > upper_whisker)
  )

# ==============================
# 4. Boxplot with jittered points and statistic labels
# ==============================
# Builds one left-aligned text label to the right of the box (x = 1.2).
stat_label <- function(name, value, colour, size = 3.5, fontface = "plain") {
  ggplot2::annotate(
    "text",
    x = 1.2, y = value,
    label = paste(name, round(value, 2)),
    hjust = 0, fontface = fontface, colour = colour, size = size
  )
}

# Rows holding the maximum, kept only when the maximum is not an outlier.
max_inside_fence <- data_bisnis %>%
  filter(Quantity == stats$Max[[1]], Quantity <= upper_whisker)

ggplot(data_bisnis, aes(x = factor(1), y = Quantity)) +
  geom_boxplot(fill = "skyblue", outlier.shape = NA) +
  # Points outside the fences are drawn red, the rest black.
  geom_jitter(
    aes(color = Quantity < lower_whisker | Quantity > upper_whisker),
    width = 0.1, size = 1.5, alpha = 0.5
  ) +
  scale_color_manual(values = c("FALSE" = "black", "TRUE" = "red"), guide = "none") +
  geom_point(
    data = max_inside_fence,
    aes(x = factor(1), y = Quantity),
    colour = "red", size = 4
  ) +
  stat_label("Mean:", stats$Mean[[1]], "blue", size = 4, fontface = "bold") +
  stat_label("Q1:", stats$Q1[[1]], "darkgreen") +
  stat_label("Median:", stats$Median[[1]], "purple") +
  stat_label("Q3:", stats$Q3[[1]], "darkgreen") +
  stat_label("Min:", stats$Min[[1]], "orange") +
  stat_label("Max:", stats$Max[[1]], "orange") +
  # Outlier count, centred 5% above the maximum.
  ggplot2::annotate(
    "text",
    x = 1, y = stats$Max[[1]] + 0.05 * stats$Max[[1]],
    label = paste("Outliers:", stats$Outliers[[1]]),
    colour = "red", fontface = "italic", hjust = 0.5, size = 4
  ) +
  labs(
    title = "Boxplot of Quantity with Jitter",
    x = NULL,
    y = "Quantity"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.title = element_text(size = 20, face = "bold"),
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 12)
  )
2.4 Violin Plot
# ==============================
# 1. Packages
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Data and outlier fences
# ==============================
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv", stringsAsFactors = FALSE)

# Coerce Quantity to numeric and drop rows where it is missing.
data_bisnis <- data_bisnis %>%
  mutate(Quantity = as.numeric(Quantity)) %>%
  filter(!is.na(Quantity))

# Quartiles, then fences at 1.5 * IQR beyond them.
quartiles <- quantile(data_bisnis$Quantity, c(0.25, 0.75))
Q1 <- quartiles[1]
Q3 <- quartiles[2]
IQR_value <- IQR(data_bisnis$Quantity)
upper_whisker <- Q3 + 1.5 * IQR_value
lower_whisker <- Q1 - 1.5 * IQR_value

# Flag every row as "Outlier" or "Normal" relative to the fences.
data_bisnis <- data_bisnis %>%
  mutate(
    is_outlier = ifelse(
      Quantity < lower_whisker | Quantity > upper_whisker,
      "Outlier",
      "Normal"
    )
  )

# ==============================
# 3. One-row summary of the statistics shown on the plot
# ==============================
stats <- data_bisnis %>%
  summarise(
    Mean = mean(Quantity),
    Q1 = Q1,
    Median = median(Quantity),
    Q3 = Q3,
    Min = min(Quantity),
    Max = max(Quantity),
    Outliers = sum(is_outlier == "Outlier")
  )

# ==============================
# 4. Violin plot with an inner boxplot, jittered points and labels
# ==============================
# Builds one left-aligned text label to the right of the violin (x = 1.2).
stat_label <- function(name, value, colour, fontface = "plain") {
  ggplot2::annotate(
    "text",
    x = 1.2, y = value,
    label = paste(name, round(value, 2)),
    hjust = 0, colour = colour, fontface = fontface
  )
}

# Rows holding the maximum, kept only when the maximum is not an outlier.
max_inside_fence <- data_bisnis %>%
  filter(Quantity == stats$Max[[1]], Quantity <= upper_whisker)

ggplot(data_bisnis, aes(x = factor(1), y = Quantity)) +
  geom_violin(fill = "skyblue", trim = FALSE) +
  geom_boxplot(width = 0.1, outlier.shape = NA, colour = "black") +
  geom_jitter(aes(color = is_outlier), width = 0.1, alpha = 0.6, size = 2) +
  geom_point(
    data = max_inside_fence,
    aes(x = factor(1), y = Quantity),
    colour = "red", size = 8
  ) +
  stat_label("Mean:", stats$Mean[[1]], "blue", fontface = "bold") +
  stat_label("Q1:", stats$Q1[[1]], "darkgreen") +
  stat_label("Median:", stats$Median[[1]], "purple") +
  stat_label("Q3:", stats$Q3[[1]], "darkgreen") +
  stat_label("Min:", stats$Min[[1]], "orange") +
  stat_label("Max:", stats$Max[[1]], "orange") +
  # Outlier count, centred 5% above the maximum.
  ggplot2::annotate(
    "text",
    x = 1, y = stats$Max[[1]] + 0.05 * stats$Max[[1]],
    label = paste("Outliers:", stats$Outliers[[1]]),
    colour = "red", fontface = "italic", hjust = 0.5
  ) +
  scale_color_manual(values = c("Normal" = "black", "Outlier" = "red")) +
  labs(
    title = "Violin Plot of Quantity with Outlier Highlighted",
    x = NULL,
    y = "Quantity",
    color = "Point Type"
  ) +
  theme_minimal(base_size = 15) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.title = element_text(size = 20, face = "bold"),
    axis.title = element_text(size = 10),
    axis.text = element_text(size = 10),
    legend.position = "right",
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 10)  # this entry previously raised an error
  )
3 Combo
3.1 Grouped Bar Chart
# ==============================
# 1. Load Libraries
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Load Data
# ==============================
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv", stringsAsFactors = FALSE)

# ==============================
# 3. Data Summarization
# ==============================
# Total transaction value for every category/region pair.
sales_summary <- data_bisnis %>%
  group_by(Product_Category, Region) %>%
  summarise(Total_Sales = sum(Total_Price, na.rm = TRUE), .groups = "drop")

# ==============================
# 4. Plot Grouped Bar Chart
# ==============================
ggplot(sales_summary, aes(x = Product_Category, y = Total_Sales, fill = Region)) +
  # geom_col() is the direct form of geom_bar(stat = "identity").
  geom_col(position = position_dodge()) +
  # Same "Rp " formatting as the bar chart in section 1.1; the axis title
  # previously said "USD", which contradicted the rest of the report.
  scale_y_continuous(labels = scales::label_comma(prefix = "Rp ")) +
  labs(
    title = "Total Sales by Product Category and Region",
    x = "Product Category",
    y = "Total Sales (Rp)",
    fill = "Region"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    axis.text.x = element_text(angle = 15, hjust = 1),
    plot.title = element_text(face = "bold", hjust = 0.5)
  )
3.2 Ridgeline Plot
# ==============================
# 1. Load Libraries
# ==============================
library(ggridges)
library(ggplot2)
library(dplyr)
library(scales)

# ==============================
# 2. Filter Valid Data
# ==============================
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv", stringsAsFactors = FALSE)

# Keep only rows whose Price_per_Unit is a finite number (drops NA, NaN, Inf).
# NOTE(review): the time-series sections of this document read a `Unit_Price`
# column instead; confirm which of the two names the CSV actually provides.
data_bisnis_filtered <- data_bisnis %>%
  filter(is.finite(Price_per_Unit))

# ==============================
# 3. Create Ridgeline Plot
# ==============================
ggplot(data_bisnis_filtered, aes(x = Price_per_Unit, y = Region, fill = Region)) +
  geom_density_ridges(alpha = 0.7, scale = 1.2) +
  # label_dollar() is the current name of the superseded dollar_format().
  scale_x_continuous(
    labels = label_dollar(prefix = "Rp", big.mark = ".", decimal.mark = ",")
  ) +
  labs(
    title = "Distribution of Price per Unit by Region",
    x = "Price per Unit",
    y = "Region"
  ) +
  # A single complete theme; the extra theme_minimal() that used to precede
  # this one was overridden immediately and had no effect.
  theme_minimal(base_size = 15) +
  theme(legend.position = "none")
3.3 Boxplot by Category
# ==============================
# 1. Load Required Libraries
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Prepare Data
# ==============================
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv", stringsAsFactors = FALSE)

# Coerce Quantity to numeric and drop rows where it is missing.
data_bisnis <- data_bisnis %>%
  mutate(Quantity = as.numeric(Quantity)) %>%
  filter(!is.na(Quantity))

# ==============================
# 3. Create Boxplot
# ==============================
ggplot(data_bisnis, aes(x = Product_Category, y = Quantity, fill = Product_Category)) +
  # Outliers are drawn as small red filled circles.
  geom_boxplot(outlier.colour = "red", outlier.shape = 16, outlier.size = 1) +
  labs(
    title = "Boxplot of Quantity by Product Category",
    x = "Product Category",
    y = "Quantity"
  ) +
  # A single complete theme; the extra theme_minimal() that used to precede
  # this one was overridden immediately and had no effect.
  theme_minimal(base_size = 10) +
  theme(
    plot.title = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 11),
    axis.text = element_text(size = 9),
    legend.position = "none"  # fill duplicates the x axis, so no legend
  )
3.4 Lollipop Chart
Lollipop Chart (Gabungan)
# ==============================
# 1. Load Required Libraries
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Prepare Data
# ==============================
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv", stringsAsFactors = FALSE)

# Total transaction value for every category/region pair.
sales_grouped <- data_bisnis %>%
  group_by(Product_Category, Region) %>%
  summarise(Total_Sales = sum(Total_Price, na.rm = TRUE), .groups = "drop")

# ==============================
# 3. Grouped Lollipop Chart
# ==============================
# The ordered category is computed once and used for y AND yend. Previously
# the segment layer overrode y with the plain Product_Category column, so the
# axis was trained on the unordered values first and reorder() had no effect.
sales_grouped %>%
  mutate(Category_Ordered = reorder(Product_Category, Total_Sales)) %>%
  ggplot(aes(x = Total_Sales, y = Category_Ordered, color = Region)) +
  # Stick from zero to the value; `linewidth` replaces the deprecated `size`
  # for line geoms (ggplot2 >= 3.4.0).
  geom_segment(aes(x = 0, xend = Total_Sales, yend = Category_Ordered), linewidth = 5) +
  geom_point(size = 5) +
  labs(
    title = "Grouped Lollipop Chart",
    x = "Total Sales",
    y = "Product Category"
  ) +
  # A single complete theme; the extra theme_minimal() that used to precede
  # this one was overridden immediately and had no effect.
  theme_minimal(base_size = 20) +
  theme(
    axis.text = element_text(size = 6),
    axis.title = element_text(size = 7),
    plot.title = element_text(size = 7, face = "bold")
  )
Lollipop Chart (Yang dipisah)
# ==============================
# 4. Faceted Lollipop Chart
# ==============================
# One panel per region, built from the `sales_grouped` summary created in the
# grouped lollipop section.
# The ordered category is computed once and used for y AND yend. Previously
# the segment layer overrode y with the plain Product_Category column, so the
# axis was trained on the unordered values first and reorder() had no effect.
sales_grouped %>%
  mutate(Category_Ordered = reorder(Product_Category, Total_Sales)) %>%
  ggplot(aes(x = Total_Sales, y = Category_Ordered)) +
  # Stick from zero to the value; `linewidth` replaces the deprecated `size`
  # for line geoms (ggplot2 >= 3.4.0).
  geom_segment(
    aes(x = 0, xend = Total_Sales, yend = Category_Ordered),
    color = "skyblue", linewidth = 5
  ) +
  geom_point(color = "blue", size = 5) +
  facet_wrap(~ Region, scales = "free_x") +  # each region gets its own x range
  labs(
    title = "Faceted Lollipop Chart",
    x = "Total Sales",
    y = "Product Category"
  ) +
  # A single complete theme; the extra theme_minimal() that used to precede
  # this one was overridden immediately and had no effect.
  theme_minimal(base_size = 20) +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 10),
    plot.title = element_text(size = 10, face = "bold")
  )
3.5 Heatmap
library(ggplot2)
library(dplyr)
library(readr)

# Read the data
data <- read_csv("8 Descriptive Visualizations – Data Science Programming (1).csv")

# Total sales for every region/category pair; `.groups = "drop"` returns an
# ungrouped result directly, so no separate ungroup() step is needed.
agg_data <- data %>%
  group_by(Region, Product_Category) %>%
  summarise(Total_Sales = sum(Total_Price, na.rm = TRUE), .groups = "drop")

# Heatmap: one tile per region/category pair, shaded and labelled by total sales.
ggplot(agg_data, aes(x = Product_Category, y = Region, fill = Total_Sales)) +
  geom_tile(color = "white", linewidth = 0.7) +
  geom_text(aes(label = round(Total_Sales, 0)), color = "black", size = 4) +
  scale_fill_gradient(low = "lightyellow", high = "red", name = "Total Sales") +
  labs(
    # The previous title promised outlier markers ("dengan Penanda Outlier"),
    # but the plot draws none; the title now describes what is shown.
    title = "Heatmap Total Penjualan per Wilayah dan Kategori Produk",
    x = "Kategori Produk",
    y = "Wilayah"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid = element_blank()
  )
4 Relationship
4.1 Scatter Plot
# Attach the tidyverse (provides ggplot2)
library(tidyverse)

# Read the data file
data <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv")

# Basic scatter plot: quantity purchased against total price
ggplot(data = data, mapping = aes(x = Quantity, y = Total_Price)) +
  geom_point(colour = "orange", size = 3, alpha = 0.6) +
  ggtitle("Hubungan Antara Jumlah Produk dan Total Harga") +
  xlab("Jumlah Produk Dibeli (Quantity)") +
  ylab("Total Harga Pembelian (Total_Price)") +
  theme_minimal()
4.2 Bubble Chart
# Attach the tidyverse (provides ggplot2)
library(tidyverse)

# Read the data file
data <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv")

# Bubble chart: quantity against total price, bubble area driven by Discount
ggplot(data = data, mapping = aes(x = Quantity, y = Total_Price, size = Discount)) +
  geom_point(colour = "deepskyblue", alpha = 0.5) +
  scale_size_continuous(range = c(1, 10)) +  # smallest and largest bubble size
  ggtitle("Bubble Chart: Quantity vs Total Price (Ukuran = Diskon)") +
  xlab("Jumlah Produk Dibeli (Quantity)") +
  ylab("Total Harga Pembelian (Total_Price)") +
  labs(size = "Diskon") +
  theme_minimal()
4.3 Correlation Matrix
library(readr)
library(dplyr)
library(ggcorrplot)

data <- read_csv("8 Descriptive Visualizations – Data Science Programming (1).csv")

# Keep only the numeric columns. select(where()) replaces the superseded
# select_if().
num_data <- data %>%
  select(where(is.numeric))

# Pearson correlations. With the default `use = "everything"`, a single missing
# value turns every coefficient for that column into NA; pairwise deletion
# uses all rows available for each pair and is identical when nothing is missing.
cor_matrix <- cor(num_data, use = "pairwise.complete.obs", method = "pearson")

ggcorrplot(
  cor_matrix,
  method = "square",
  type = "full",
  lab = TRUE,                          # print the coefficient in each cell
  lab_size = 2,
  colors = c("blue", "white", "red"),  # low, mid, high
  title = "Heatmap Korelasi Pearson",
  tl.cex = 10,
  show.legend = TRUE,
  legend.title = "Pearson Correlation"
) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 15, face = "bold", hjust = 0.5),
    axis.text.x = element_text(angle = 20, hjust = 1),
    panel.grid = element_blank()
  )
5 Time Series
5.1 Line Chart
library(dplyr)
library(ggplot2)
library(lubridate)

# Load data
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv", stringsAsFactors = FALSE)

# Parse Transaction_Date as Date and compute sales per transaction.
# NOTE(review): other sections aggregate the existing `Total_Price` column and
# the ridgeline section reads `Price_per_Unit`; confirm that `Unit_Price`
# exists in the CSV and that recomputing sales here is intended.
data_bisnis <- data_bisnis %>%
  mutate(
    Transaction_Date = as.Date(Transaction_Date),
    Total_Sales = Unit_Price * Quantity
  )

# Monthly totals; floor_date() maps each date to the first day of its month.
monthly_sales <- data_bisnis %>%
  mutate(year_month = floor_date(Transaction_Date, "month")) %>%
  group_by(year_month) %>%
  summarise(Total_Sales = sum(Total_Sales, na.rm = TRUE), .groups = "drop")

# Line chart of monthly sales
ggplot(monthly_sales, aes(x = year_month, y = Total_Sales)) +
  # `linewidth` replaces the deprecated `size` for line geoms (ggplot2 >= 3.4.0).
  geom_line(color = "darkorange", linewidth = 1.2) +
  geom_point(color = "red", size = 2) +
  scale_x_date(
    date_labels = "%b %Y",
    date_breaks = "2 months"
  ) +
  labs(
    title = "Total Sales Line Chart Over Time",
    x = "Month",
    y = "Total Sales"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
5.2 Area Chart
library(dplyr)
library(ggplot2)
library(lubridate)

# Load data
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv", stringsAsFactors = FALSE)

# Parse Transaction_Date as Date and compute sales per transaction.
# NOTE(review): other sections aggregate the existing `Total_Price` column and
# the ridgeline section reads `Price_per_Unit`; confirm that `Unit_Price`
# exists in the CSV and that recomputing sales here is intended.
data_bisnis <- data_bisnis %>%
  mutate(
    Transaction_Date = as.Date(Transaction_Date),
    Total_Sales = Unit_Price * Quantity
  )

# Monthly totals; floor_date() maps each date to the first day of its month.
monthly_sales <- data_bisnis %>%
  mutate(year_month = floor_date(Transaction_Date, "month")) %>%
  group_by(year_month) %>%
  summarise(Total_Sales = sum(Total_Sales, na.rm = TRUE), .groups = "drop")

# Area chart of monthly sales
ggplot(monthly_sales, aes(x = year_month, y = Total_Sales)) +
  geom_area(fill = "darkorange", alpha = 0.6) +  # translucent orange area
  # `linewidth` replaces the deprecated `size` for line geoms (ggplot2 >= 3.4.0).
  geom_line(color = "red", linewidth = 1) +      # red outline on top of the area
  geom_point(color = "red", size = 2) +          # one red point per month
  scale_x_date(
    date_labels = "%b %Y",
    date_breaks = "2 months"
  ) +
  labs(
    title = "Total Sales Area Chart Over Time",
    x = "Month",
    y = "Total Sales"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
---
title: "Tugas Individu "
subtitle: "Tugas Data Science Programming 13"
author: "Olivia Meilinda Davtin Pesireron"
date: "`r format(Sys.Date(), '%B %d, %Y')`"
output:
  rmdformats::readthedown:   # https://github.com/juba/rmdformats
    self_contained: true
    thumbnails: true
    lightbox: true
    gallery: true
    lib_dir: libs
    df_print: "paged"
    code_folding: "show"
    code_download: yes 
    css: "style.css"
---

<img src="livikatanya.jpg" width="300" style="display: block; margin: auto;" alt="Foto Diri">

---

# Data set 

```{r echo=FALSE, message=FALSE, warning=FALSE}

library(readr)
# Load the transactions CSV with base R's read.csv(); readr is attached above,
# but this call does not use it.
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv")

# Evaluate the data frame at top level so the chunk outputs the data set.
data_bisnis
```
---

# 1 Categorical Data

## 1.1 Bar Chart

```{r, echo=TRUE, message=FALSE, warning=FALSE}
# ---- Packages ----
library(dplyr)    # grouping and aggregation verbs
library(ggplot2)  # plotting
library(viridis)  # turbo colour palette
library(scales)   # number formatting for labels

# ---- 1. Total transaction value per product category, largest first ----
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv")
sales_summary <- data_bisnis %>%
  group_by(Product_Category) %>%
  summarise(Total_Sales = sum(Total_Price, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(Total_Sales))

# ---- 2. One turbo colour per category ----
custom_colors <- viridis::turbo(n = nrow(sales_summary))

# ---- 3. Bar chart with a value label above each bar ----
# The same formatter is used for the bar labels and the y-axis ticks.
format_rupiah <- scales::label_comma(prefix = "Rp ")

ggplot(
  sales_summary,
  aes(
    x = reorder(Product_Category, -Total_Sales),  # tallest bar on the left
    y = Total_Sales,
    fill = Product_Category
  )
) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = format_rupiah(Total_Sales)), vjust = -0.3, size = 3) +
  scale_fill_manual(values = custom_colors) +
  scale_y_continuous(
    labels = format_rupiah,
    expand = expansion(mult = c(0, 0.1))  # no gap below bars, 10% headroom above
  ) +
  labs(
    title = "Total Sales by Product Category (2020–2024)",
    subtitle = "Based on Transaction Value",
    x = "Product Category",
    y = "Total Sales",
    caption = "@siregarbakti"
  ) +
  theme_minimal(base_size = 10)
```



## 1.2 Pie Chart

```{r, echo=TRUE, message=FALSE, warning=FALSE}
# ---- Packages ----
library(dplyr)    # grouping and aggregation verbs
library(ggplot2)  # plotting
library(viridis)  # turbo colour palette
library(scales)   # percentage formatting

# ---- 1. Sales share per product category ----
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv")
sales_summary <- data_bisnis %>%
  group_by(Product_Category) %>%
  summarise(Total_Sales = sum(Total_Price, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(Total_Sales)) %>%
  # Share of the grand total for each category.
  mutate(Percentage = Total_Sales / sum(Total_Sales)) %>%
  # Two-line slice label: category name, then the rounded percentage.
  mutate(
    Label = paste0(
      Product_Category, "\n",
      scales::percent(Percentage, accuracy = 1)
    )
  )

# ---- 2. One turbo colour per category ----
custom_colors <- viridis::turbo(n = nrow(sales_summary))

# ---- 3. Donut chart ----
# Text styling is kept in its own object so the plot pipeline stays short.
donut_text_theme <- theme(
  plot.title = element_text(face = "bold", hjust = 0.5),
  plot.subtitle = element_text(margin = margin(t = 8, b = 20), hjust = 0.5),
  plot.caption = element_text(
    margin = margin(t = 15), hjust = 1.5,
    color = "gray20", face = "italic"
  )
)

ggplot(sales_summary, aes(x = 2, y = Percentage, fill = Product_Category)) +
  geom_col(width = 1, colour = "white", show.legend = FALSE) +
  geom_text(
    aes(label = Label),
    position = position_stack(vjust = 0.5),  # centre each label in its slice
    size = 3, colour = "white", fontface = "bold"
  ) +
  coord_polar(theta = "y") +
  scale_fill_manual(values = custom_colors) +
  # The bars span x = 1.5..2.5; starting the axis at 0.5 leaves the donut hole.
  scale_x_continuous(limits = c(0.5, 2.5)) +
  labs(
    title = "Sales Distribution by Product Category (2020–2024)",
    subtitle = "Based on Total Transaction Value",
    caption = "@siregarbakti"
  ) +
  theme_void(base_size = 10) +
  donut_text_theme
```


## 1.3 Word Cloud

```{r, echo=TRUE, message=FALSE, warning=FALSE}
# ==============================
# 1. Install & Load Required Packages
# ==============================
packages <- c("dplyr", "tm", "wordcloud", "RColorBrewer")
# system.file() returns "" for a package that is not installed; this avoids
# building the full installed.packages() table just to test four names.
is_installed <- vapply(
  packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
new_packages <- packages[!is_installed]
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

library(dplyr)
library(tm)
library(wordcloud)
library(RColorBrewer)

# ==============================
# 2. Read and Combine Text Columns
# ==============================
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv")

# One text string per row: category, region and sales channel joined by spaces.
text_data <- paste(
  data_bisnis$Product_Category,
  data_bisnis$Region,
  data_bisnis$Sales_Channel,
  sep = " "
)

# ==============================
# 3. Clean and Prepare Text
# ==============================
corpus <- VCorpus(VectorSource(text_data))

corpus_clean <- corpus %>%
  tm_map(content_transformer(tolower)) %>%       # convert to lowercase
  tm_map(removePunctuation) %>%                  # remove punctuation
  tm_map(removeNumbers) %>%                      # remove numbers
  tm_map(removeWords, stopwords("english")) %>%  # remove English stopwords
  tm_map(stripWhitespace)                        # collapse repeated whitespace

# Drop documents that are empty or whitespace-only after cleaning.
# vapply() always yields a logical vector, even for an empty corpus.
non_empty_idx <- vapply(
  corpus_clean,
  function(doc) any(nzchar(trimws(content(doc)))),
  logical(1)
)
corpus_clean <- corpus_clean[non_empty_idx]

# ==============================
# 4. Create Term-Document Matrix & Word Frequencies
# ==============================
tdm <- TermDocumentMatrix(corpus_clean)
m <- as.matrix(tdm)
word_freqs <- sort(rowSums(m), decreasing = TRUE)
df_words <- data.frame(word = names(word_freqs), freq = word_freqs)

# ==============================
# 5. Generate Word Cloud
# ==============================
set.seed(123)  # fixed seed so the layout is reproducible
wordcloud(
  words = df_words$word,
  freq = df_words$freq,
  # `scale` is the (largest, smallest) word size. The previous c(4, 4) drew
  # every word at the same size, so frequency was not visible.
  scale = c(4, 0.8),
  min.freq = 1,
  max.words = 300,
  random.order = FALSE,  # most frequent words are placed first, in the centre
  rot.per = 0.3,
  colors = brewer.pal(8, "Dark2")
)
```


## 1.4 Treemap


```{r, echo=TRUE, message=FALSE, warning=FALSE}
# ==============================
# 1. Install missing packages, then attach them
# ==============================
packages <- c("treemapify", "dplyr", "ggplot2")
new_packages <- setdiff(packages, rownames(installed.packages()))
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

library(treemapify)
library(ggplot2)
library(dplyr)

# ==============================
# 2. Total sales per category/region pair
# ==============================
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv")
tree_data <- data_bisnis %>%
  group_by(Product_Category, Region) %>%
  summarise(Total_Sales = sum(Total_Price, na.rm = TRUE), .groups = "drop") %>%
  # Tile label: region on the first line, rounded total on the second.
  mutate(label_combined = paste(Region, round(Total_Sales), sep = "\n"))

# ==============================
# 3. Static treemap: tiles sized by sales, grouped and coloured by category
# ==============================
treemap_mapping <- aes(
  area = Total_Sales,
  fill = Product_Category,
  subgroup = Product_Category
)

ggplot(tree_data, treemap_mapping) +
  geom_treemap() +
  geom_treemap_subgroup_border(colour = "white") +
  geom_treemap_text(
    aes(label = label_combined),
    colour = "white",
    place = "centre",
    grow = FALSE,
    reflow = TRUE,
    size = 20 / .pt,  # overall label font size
    min.size = 3
  ) +
  ggtitle("Tree Map of Total Sales by Product Category and Region") +
  theme_minimal()
```


---

# 2 Numerical Data

## 2.1 Histogram

```{r, echo=TRUE, message=FALSE, warning=FALSE}
# ---- 1. Packages ----
library(ggplot2)
library(dplyr)

# ---- 2. Data ----
# Read the transactions and coerce Quantity to numeric in a single pipeline.
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv") %>%
  mutate(Quantity = as.numeric(Quantity))

# ---- 3. Histogram of Quantity ----
# One bar per unit of Quantity (binwidth = 1); text sizes are set explicitly
# on top of theme_minimal().
ggplot(data = data_bisnis, mapping = aes(x = Quantity)) +
  geom_histogram(
    binwidth = 1,
    alpha = 0.7,
    color = "gray",
    fill = "skyblue"
  ) +
  ggtitle("Histogram of Quantity Distribution") +
  xlab("Quantity") +
  ylab("Frequency") +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 15),  # bold title
    axis.title = element_text(size = 10),                 # both axis labels
    axis.text = element_text(size = 9)                    # both axis tick labels
  )
```

## 2.2 Density Plot

```{r, echo=TRUE, message=FALSE, warning=FALSE}
# ---- 1. Packages ----
library(ggplot2)
library(dplyr)

# ---- 2. Data ----
# Quantity is coerced to numeric; rows where it is missing are dropped.
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv") %>%
  mutate(Quantity = as.numeric(Quantity)) %>%
  filter(!is.na(Quantity))

# Mean of Quantity, drawn as the dashed reference line
mean_quantity <- mean(data_bisnis$Quantity, na.rm = TRUE)

# Kernel density estimate, used only to find the curve's peak so the label
# can be placed at 80% of that height
density_data <- density(data_bisnis$Quantity)
max_y <- max(density_data$y)

# ---- 3. Density plot with mean line and label ----
ggplot(data_bisnis, aes(x = Quantity)) +
  geom_density(alpha = 0.6, fill = "skyblue") +
  geom_vline(
    xintercept = mean_quantity,
    linetype = "dashed",
    linewidth = 1,
    color = "red"
  ) +
  # Single-row data frame carries the label and its position; the layer does
  # not inherit the plot-level aesthetics.
  geom_text(
    data = data.frame(
      x = mean_quantity,
      y = max_y * 0.8,
      label = paste("Mean =", round(mean_quantity, 2))
    ),
    mapping = aes(x = x, y = y, label = label),
    inherit.aes = FALSE,
    angle = 90,          # written along the vertical mean line
    vjust = -0.5,
    size = 3,            # label size reduced so it is easier to read
    fontface = "bold",
    color = "black"
  ) +
  labs(
    title = "Density Plot of Quantity with Mean",
    x = "Quantity",
    y = "Density"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 16),   # reduced from 25 to 16
    axis.title = element_text(size = 12),                  # reduced from 20 to 12
    axis.text = element_text(size = 10)                    # reduced from 15 to 10
  )


```

## 2.3 Box Plot

```{r, echo=TRUE, message=FALSE, warning=FALSE}
# ==============================
# 1. Load Libraries
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Load and Prepare Data
# ==============================
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv", stringsAsFactors = FALSE)

# Convert Quantity to numeric and drop rows where it is missing
data_bisnis <- data_bisnis %>%
  mutate(Quantity = as.numeric(Quantity)) %>%
  filter(!is.na(Quantity))

# IQR-based outlier fences: anything beyond 1.5 * IQR from the quartiles
Q1 <- quantile(data_bisnis$Quantity, 0.25)
Q3 <- quantile(data_bisnis$Quantity, 0.75)
IQR_value <- IQR(data_bisnis$Quantity)
lower_whisker <- Q1 - 1.5 * IQR_value
upper_whisker <- Q3 + 1.5 * IQR_value

# ==============================
# 3. Summarize Statistics
# ==============================
# One-row summary used for the text annotations below. Q1/Q3 on the right-hand
# side are the quantiles computed above (data_bisnis has no such columns here).
stats <- data_bisnis %>%
  summarise(
    Mean = mean(Quantity),
    Q1 = Q1,
    Median = median(Quantity),
    Q3 = Q3,
    Min = min(Quantity),
    Max = max(Quantity),
    Outliers = sum(Quantity < lower_whisker | Quantity > upper_whisker)
  )

# ==============================
# 4. Basic Boxplot with Jitter and Annotations
# ==============================
ggplot(data_bisnis, aes(x = factor(1), y = Quantity)) +
  # Outliers are hidden here because the jitter layer draws every point.
  geom_boxplot(fill = "skyblue", outlier.shape = NA) +

  # Jitter horizontally only. Without height = 0 ggplot2 also adds vertical
  # noise, which moves points away from their true Quantity and lets red
  # "outlier" points appear inside the fences. The seed keeps the scatter
  # identical between knits.
  geom_jitter(
    aes(color = Quantity < lower_whisker | Quantity > upper_whisker),
    position = position_jitter(width = 0.1, height = 0, seed = 123),
    size = 1.5, alpha = 0.5
  ) +
  scale_color_manual(values = c("FALSE" = "black", "TRUE" = "red"), guide = "none") +

  # Large red marker on the maximum, drawn only when the maximum lies inside
  # the upper fence. NOTE(review): confirm this condition is intended.
  geom_point(data = data_bisnis %>% filter(Quantity == stats$Max[[1]] & Quantity <= upper_whisker),
             aes(x = factor(1), y = Quantity),
             color = "red", size = 4) +

  # ggplot2:: prefix avoids picking up an annotate() from another attached package.
  ggplot2::annotate("text", x = 1.2, y = stats$Mean[[1]],
           label = paste("Mean:", round(stats$Mean[[1]], 2)),
           hjust = 0, fontface = "bold", color = "blue", size = 4) +
  ggplot2::annotate("text", x = 1.2, y = stats$Q1[[1]],
           label = paste("Q1:", round(stats$Q1[[1]], 2)),
           hjust = 0, color = "darkgreen", size = 3.5) +
  ggplot2::annotate("text", x = 1.2, y = stats$Median[[1]],
           label = paste("Median:", round(stats$Median[[1]], 2)),
           hjust = 0, color = "purple", size = 3.5) +
  ggplot2::annotate("text", x = 1.2, y = stats$Q3[[1]],
           label = paste("Q3:", round(stats$Q3[[1]], 2)),
           hjust = 0, color = "darkgreen", size = 3.5) +
  ggplot2::annotate("text", x = 1.2, y = stats$Min[[1]],
           label = paste("Min:", round(stats$Min[[1]], 2)),
           hjust = 0, color = "orange", size = 3.5) +
  ggplot2::annotate("text", x = 1.2, y = stats$Max[[1]],
           label = paste("Max:", round(stats$Max[[1]], 2)),
           hjust = 0, color = "orange", size = 3.5) +
  # Outlier count sits 5% above the maximum, centred over the box.
  ggplot2::annotate("text", x = 1, y = stats$Max[[1]] + 0.05 * stats$Max[[1]],
           label = paste("Outliers:", stats$Outliers[[1]]),
           color = "red", fontface = "italic", hjust = 0.5, size = 4) +

  labs(
    title = "Boxplot of Quantity with Jitter",
    x = NULL,
    y = "Quantity"
  ) +
  theme_minimal() +
  theme(
    # The x axis is a dummy factor, so its text and ticks are hidden.
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.title = element_text(size = 20, face = "bold"),
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 12)
  )


```

## 2.4 Violin Plot

```{r, echo=TRUE, message=FALSE, warning=FALSE}
# ==============================
# 1. Load Libraries
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Load and Prepare Data
# ==============================
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv", stringsAsFactors = FALSE)

# Convert Quantity to numeric and drop rows where it is missing
data_bisnis <- data_bisnis %>%
  mutate(Quantity = as.numeric(Quantity)) %>%
  filter(!is.na(Quantity))

# IQR-based outlier fences: anything beyond 1.5 * IQR from the quartiles
Q1 <- quantile(data_bisnis$Quantity, 0.25)
Q3 <- quantile(data_bisnis$Quantity, 0.75)
IQR_value <- IQR(data_bisnis$Quantity)
upper_whisker <- Q3 + 1.5 * IQR_value
lower_whisker <- Q1 - 1.5 * IQR_value

# Flag each row as "Outlier" or "Normal"; the labels are the keys of the
# manual colour scale below. if_else() is the type-strict form of ifelse().
data_bisnis <- data_bisnis %>%
  mutate(
    is_outlier = if_else(
      Quantity < lower_whisker | Quantity > upper_whisker,
      "Outlier",
      "Normal"
    )
  )

# ==============================
# 3. Summarize Statistics
# ==============================
# One-row summary that feeds the geom_text() annotations. Q1/Q3 on the
# right-hand side are the quantiles computed above.
stats <- data_bisnis %>%
  summarise(
    Mean = mean(Quantity),
    Q1 = Q1,
    Median = median(Quantity),
    Q3 = Q3,
    Min = min(Quantity),
    Max = max(Quantity),
    Outliers = sum(is_outlier == "Outlier")
  )

# ==============================
# 4. Create Violin Plot
# ==============================
ggplot(data_bisnis, aes(x = factor(1), y = Quantity)) +
  # trim = FALSE extends the violin tails beyond the observed range.
  geom_violin(fill = "skyblue", trim = FALSE) +
  geom_boxplot(width = 0.1, outlier.shape = NA, color = "black") +
  # Jitter horizontally only. Without height = 0 ggplot2 also adds vertical
  # noise, so points would no longer sit at their true Quantity. The seed
  # keeps the scatter identical between knits.
  geom_jitter(
    aes(color = is_outlier),
    position = position_jitter(width = 0.1, height = 0, seed = 123),
    alpha = 0.6, size = 2
  ) +
  # Large red marker on the maximum, drawn only when the maximum lies inside
  # the upper fence. NOTE(review): confirm this condition is intended.
  geom_point(data = data_bisnis %>%
               filter(Quantity == stats$Max[[1]] & Quantity <= upper_whisker),
             aes(x = factor(1), y = Quantity),
             color = "red", size = 8) +

  # Statistic labels, placed to the right of the violin (x = 1.2)
  geom_text(data = stats, aes(x = 1.2, y = Mean, label = paste("Mean:", round(Mean, 2))),
            hjust = 0, color = "blue", fontface = "bold") +
  geom_text(data = stats, aes(x = 1.2, y = Q1, label = paste("Q1:", round(Q1, 2))),
            hjust = 0, color = "darkgreen") +
  geom_text(data = stats, aes(x = 1.2, y = Median, label = paste("Median:", round(Median, 2))),
            hjust = 0, color = "purple") +
  geom_text(data = stats, aes(x = 1.2, y = Q3, label = paste("Q3:", round(Q3, 2))),
            hjust = 0, color = "darkgreen") +
  geom_text(data = stats, aes(x = 1.2, y = Min, label = paste("Min:", round(Min, 2))),
            hjust = 0, color = "orange") +
  geom_text(data = stats, aes(x = 1.2, y = Max, label = paste("Max:", round(Max, 2))),
            hjust = 0, color = "orange") +
  # Outlier count sits 5% above the maximum, centred over the violin.
  geom_text(data = stats, aes(x = 1, y = Max + 0.05 * Max, label = paste("Outliers:", Outliers)),
            color = "red", fontface = "italic", hjust = 0.5) +

  scale_color_manual(values = c("Normal" = "black", "Outlier" = "red")) +

  labs(
    title = "Violin Plot of Quantity with Outlier Highlighted",
    x = NULL,
    y = "Quantity",
    color = "Point Type"
  ) +
  theme_minimal(base_size = 15) +
  theme(
    # The x axis is a dummy factor, so its text and ticks are hidden.
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    plot.title = element_text(size = 20, face = "bold"),
    axis.title = element_text(size = 10),
    axis.text = element_text(size = 10),
    legend.position = "right",
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 10)  # this entry previously raised an error
  )


```

---

# 3 Combo

## 3.1 Grouped Bar Chart

```{r, echo=TRUE, message=FALSE, warning=FALSE}
# ==============================
# 1. Load Libraries
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Load Data
# ==============================
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv", stringsAsFactors = FALSE)

# ==============================
# 3. Data Summarization
# ==============================
# Summed Total_Price for every Product_Category x Region pair
sales_summary <- data_bisnis %>%
  group_by(Product_Category, Region) %>%
  summarise(Total_Sales = sum(Total_Price, na.rm = TRUE), .groups = "drop")

# ==============================
# 4. Plot Grouped Bar Chart
# ==============================
# geom_col() draws bars from pre-computed values (same as
# geom_bar(stat = "identity")); "dodge" puts the regions side by side.
ggplot(sales_summary, aes(x = Product_Category, y = Total_Sales, fill = Region)) +
  geom_col(position = position_dodge()) +
  labs(
    title = "Total Sales by Product Category and Region",
    x = "Product Category",
    # The other charts in this document format these totals with an "Rp"
    # prefix, so the unit here is labelled the same way.
    y = "Total Sales (Rp)",
    fill = "Region"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    axis.text.x = element_text(angle = 15, hjust = 1),
    plot.title = element_text(face = "bold", hjust = 0.5)
  )
```

## 3.2 Ridgeline Plot

```{r, echo=TRUE, message=FALSE, warning=FALSE}
# ---- 1. Packages ----
library(ggridges)
library(ggplot2)
library(dplyr)
library(scales)

# ---- 2. Data ----
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv", stringsAsFactors = FALSE)

# Keep only rows whose Price_per_Unit is a finite number (drops NA, NaN, Inf)
data_bisnis_filtered <- filter(data_bisnis, is.finite(Price_per_Unit))

# ---- 3. Ridgeline plot ----
# One density ridge per Region; the x axis is formatted as "Rp" with "." as
# thousands separator and "," as decimal mark. The legend is hidden because
# the y axis already names each region.
ggplot(
  data_bisnis_filtered,
  aes(x = Price_per_Unit, y = Region, fill = Region)
) +
  geom_density_ridges(scale = 1.2, alpha = 0.7) +
  scale_x_continuous(
    labels = dollar_format(prefix = "Rp", big.mark = ".", decimal.mark = ",")
  ) +
  labs(
    title = "Distribution of Price per Unit by Region",
    x = "Price per Unit",
    y = "Region"
  ) +
  # A complete theme replaces any earlier one, so a single call is enough.
  theme_minimal(base_size = 15) +
  theme(legend.position = "none")
```

## 3.3 Boxplot by Category

```{r, echo=TRUE, message=FALSE, warning=FALSE}
# ---- 1. Packages ----
library(ggplot2)
library(dplyr)

# ---- 2. Data ----
# Read the transactions, coerce Quantity to numeric and drop rows where it
# is missing.
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv", stringsAsFactors = FALSE) %>%
  mutate(Quantity = as.numeric(Quantity)) %>%
  filter(!is.na(Quantity))

# ---- 3. Boxplot per product category ----
# One box per Product_Category; outliers are drawn as small solid red dots.
# The fill legend is hidden because the x axis already names each category.
ggplot(
  data_bisnis,
  aes(x = Product_Category, y = Quantity, fill = Product_Category)
) +
  geom_boxplot(
    outlier.shape = 16,
    outlier.size = 1,
    outlier.colour = "red"
  ) +
  labs(
    title = "Boxplot of Quantity by Product Category",
    x = "Product Category",
    y = "Quantity"
  ) +
  # A complete theme replaces any earlier one, so a single call is enough.
  theme_minimal(base_size = 10) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", size = 12),
    axis.title = element_text(size = 11),
    axis.text = element_text(size = 9)
  )
```

## 3.4 Lollipop Chart

### Lollipop Chart (Gabungan)

```{r, echo=TRUE, message=FALSE, warning=FALSE}
# ==============================
# 1. Load Required Libraries
# ==============================
library(ggplot2)
library(dplyr)

# ==============================
# 2. Prepare Data
# ==============================
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv", stringsAsFactors = FALSE)

# Summarize total sales by Product_Category and Region
sales_grouped <- data_bisnis %>%
  group_by(Product_Category, Region) %>%
  summarise(Total_Sales = sum(Total_Price, na.rm = TRUE), .groups = "drop")

# ==============================
# 3. Grouped Lollipop Chart
# ==============================
# Stems run from 0 to Total_Sales and are coloured by Region.
ggplot(
  sales_grouped,
  aes(
    x = Total_Sales,
    y = reorder(Product_Category, Total_Sales),
    color = Region
  )
) +
  # The stem inherits the reordered y from the plot and uses the same
  # reordered factor for yend. Mapping the raw Product_Category here would
  # mix two orderings on one discrete axis and can discard the sort order.
  # linewidth replaces the deprecated size argument for line geoms.
  geom_segment(
    aes(
      x = 0,
      xend = Total_Sales,
      yend = reorder(Product_Category, Total_Sales)
    ),
    linewidth = 5
  ) +
  geom_point(size = 5) +
  labs(
    title = "Grouped Lollipop Chart",
    x = "Total Sales",
    y = "Product Category"
  ) +
  # A complete theme replaces any earlier one, so a single call is enough.
  theme_minimal(base_size = 20) +
  theme(
    axis.text = element_text(size = 6),
    axis.title = element_text(size = 7),
    plot.title = element_text(size = 7, face = "bold")
  )
```

### Lollipop Chart (Yang dipisah)

```{r, echo=TRUE, message=FALSE, warning=FALSE}
# ==============================
# 4. Faceted Lollipop Chart
# ==============================
# Uses sales_grouped (total sales per Product_Category x Region) built in the
# previous chunk. One panel per Region, each with its own x range.
ggplot(
  sales_grouped,
  aes(x = Total_Sales, y = reorder(Product_Category, Total_Sales))
) +
  # The stem inherits the reordered y from the plot and uses the same
  # reordered factor for yend. Mapping the raw Product_Category here would
  # mix two orderings on one discrete axis and can discard the sort order.
  # linewidth replaces the deprecated size argument for line geoms.
  geom_segment(
    aes(
      x = 0,
      xend = Total_Sales,
      yend = reorder(Product_Category, Total_Sales)
    ),
    color = "skyblue",
    linewidth = 5
  ) +
  geom_point(color = "blue", size = 5) +
  facet_wrap(~ Region, scales = "free_x") +
  labs(
    title = "Faceted Lollipop Chart",
    x = "Total Sales",
    y = "Product Category"
  ) +
  # A complete theme replaces any earlier one, so a single call is enough.
  theme_minimal(base_size = 20) +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 10),
    plot.title = element_text(size = 10, face = "bold")
  )
```

## 3.5 Heatmap

```{r, echo=TRUE, message=FALSE, warning=FALSE}
library(ggplot2)
library(dplyr)
library(readr)

# Read the transaction data
data <- read_csv("8 Descriptive Visualizations – Data Science Programming (1).csv")

# Aggregate: summed Total_Price for every Region x Product_Category pair,
# returned ungrouped
agg_data <- data %>%
  group_by(Region, Product_Category) %>%
  summarise(Total_Sales = sum(Total_Price, na.rm = TRUE), .groups = "drop")


# Draw the heatmap: one white-bordered tile per pair, filled from light
# yellow (low) to red (high) and labelled with the rounded total
ggplot(
  agg_data,
  aes(x = Product_Category, y = Region, fill = Total_Sales)
) +
  geom_tile(linewidth = 0.7, color = "white") +
  geom_text(
    aes(label = round(Total_Sales, 0)),
    size = 4,
    color = "black"
  ) +
  scale_fill_gradient(
    name = "Total Sales",
    low = "lightyellow",
    high = "red"
  ) +
  labs(
    title = "Heatmap Penjualan dengan Penanda Outlier",
    x = "Kategori Produk",
    y = "Wilayah"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    panel.grid = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
```

---

# 4 Relationship

## 4.1 Scatter Plot

```{r, echo=TRUE, message=FALSE, warning=FALSE}
# Attach the visualisation packages
library(tidyverse)

# Read the data file
data <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv")

# Basic scatter plot: one semi-transparent orange point per row,
# Quantity on x against Total_Price on y
ggplot(data = data, mapping = aes(x = Quantity, y = Total_Price)) +
  geom_point(
    alpha = 0.6,
    size = 3,
    color = "orange"
  ) +
  ggtitle("Hubungan Antara Jumlah Produk dan Total Harga") +
  xlab("Jumlah Produk Dibeli (Quantity)") +
  ylab("Total Harga Pembelian (Total_Price)") +
  theme_minimal()


```

## 4.2 Bubble Chart

```{r, echo=TRUE, message=FALSE, warning=FALSE}
# Attach the visualisation packages
library(tidyverse)

# Read the data file
data <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv")

# Bubble chart: Quantity on x against Total_Price on y, with the point size
# mapped to Discount
ggplot(
  data = data,
  mapping = aes(x = Quantity, y = Total_Price, size = Discount)
) +
  geom_point(alpha = 0.5, color = "deepskyblue") +
  # Bubble sizes are scaled into the range 1 to 10
  scale_size_continuous(range = c(1, 10)) +
  labs(
    title = "Bubble Chart: Quantity vs Total Price (Ukuran = Diskon)",
    x = "Jumlah Produk Dibeli (Quantity)",
    y = "Total Harga Pembelian (Total_Price)",
    size = "Diskon"
  ) +
  theme_minimal()


```

## 4.3 Correlation Matrix

```{r, echo=TRUE, message=FALSE, warning=FALSE}
library(readr)
library(dplyr)
library(ggcorrplot)

# Read the transaction data
data <- read_csv("8 Descriptive Visualizations – Data Science Programming (1).csv")

# Keep only numeric columns; select(where(...)) replaces the superseded
# select_if().
num_data <- data %>%
  select(where(is.numeric))

# Pearson correlation between every pair of numeric columns. Pairwise
# deletion keeps a single missing value from turning a whole row and column
# of the matrix into NA.
cor_matrix <- cor(num_data, method = "pearson", use = "pairwise.complete.obs")

# Full (non-triangular) matrix drawn as squares, each labelled with its
# coefficient and coloured blue (negative) - white - red (positive)
ggcorrplot(
  cor_matrix,
  method = "square",
  type = "full",
  lab = TRUE,
  lab_size = 2,
  colors = c("blue", "white", "red"),
  title = "Heatmap Korelasi Pearson",
  tl.cex = 10,
  show.legend = TRUE,
  legend.title = "Pearson Correlation"
) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 15, face = "bold", hjust = 0.5),
    axis.text.x = element_text(angle = 20, hjust = 1),
    panel.grid = element_blank()
  )


```


---

# 5 Time Series

## 5.1 Line Chart

```{r, echo=TRUE, message=FALSE, warning=FALSE}
library(dplyr)
library(ggplot2)
library(lubridate)

# Load data
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv", stringsAsFactors = FALSE)

# Convert Transaction_Date to Date and compute the sales value of each
# transaction as Unit_Price * Quantity.
# NOTE(review): as.Date() is called without a format — confirm the column is
# stored in a form it parses by default (e.g. YYYY-MM-DD).
data_bisnis <- data_bisnis %>%
  mutate(
    Transaction_Date = as.Date(Transaction_Date),
    Total_Sales = Unit_Price * Quantity
  )

# Monthly summary: each date is floored to the first day of its month, then
# sales are summed per month
monthly_sales <- data_bisnis %>%
  mutate(year_month = floor_date(Transaction_Date, "month")) %>%
  group_by(year_month) %>%
  summarise(Total_Sales = sum(Total_Sales, na.rm = TRUE), .groups = "drop")

# Line chart of monthly totals with a marker on every month.
# linewidth replaces the deprecated size argument for line geoms.
ggplot(monthly_sales, aes(x = year_month, y = Total_Sales)) +
  geom_line(color = "darkorange", linewidth = 1.2) +
  geom_point(color = "red", size = 2) +
  scale_x_date(
    date_labels = "%b %Y",       # e.g. "Jan 2021"
    date_breaks = "2 months"
  ) +
  labs(title = "Total Sales Line Chart Over Time",
       x = "Month",
       y = "Total Sales") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


```

## 5.2 Area Chart

```{r, echo=TRUE, message=FALSE, warning=FALSE}
library(dplyr)
library(ggplot2)
library(lubridate)

# Load data
data_bisnis <- read.csv("8 Descriptive Visualizations – Data Science Programming (1).csv", stringsAsFactors = FALSE)

# Convert Transaction_Date to Date and compute the sales value of each
# transaction as Unit_Price * Quantity.
# NOTE(review): as.Date() is called without a format — confirm the column is
# stored in a form it parses by default (e.g. YYYY-MM-DD).
data_bisnis <- data_bisnis %>%
  mutate(Transaction_Date = as.Date(Transaction_Date),
         Total_Sales = Unit_Price * Quantity)

# Monthly summary: each date is floored to the first day of its month, then
# sales are summed per month
monthly_sales <- data_bisnis %>%
  mutate(year_month = floor_date(Transaction_Date, "month")) %>%
  group_by(year_month) %>%
  summarise(Total_Sales = sum(Total_Sales, na.rm = TRUE), .groups = "drop")

# Area chart of monthly totals.
# linewidth replaces the deprecated size argument for line geoms.
ggplot(monthly_sales, aes(x = year_month, y = Total_Sales)) +
  geom_area(fill = "darkorange", alpha = 0.6) +    # semi-transparent orange area
  geom_line(color = "red", linewidth = 1) +        # red outline on top of the area
  geom_point(color = "red", size = 2) +            # red marker on every month
  scale_x_date(
    date_labels = "%b %Y",       # e.g. "Jan 2021"
    date_breaks = "2 months"
  ) +
  labs(
    title = "Total Sales Area Chart Over Time",
    x = "Month",
    y = "Total Sales"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


```

