Data Science Programming
Sub-Bab 09
## X Transaction_ID Transaction_Date Customer_ID Product_Category Product_ID
## 1 1 7zmPHxF7XfN9 2021-07-14 BAl3Y7yxev Clothing P0370
## 2 2 y4bCY9pKTBWU 2020-11-16 TYY0h5C190 Electronics P0185
## 3 3 8k0B7XX19Ykf 2023-03-22 nUX640AaXg Home P0443
## 4 4 l8ahQz5YNOKz 2023-01-02 sBZyUSJLEP Home P0035
## 5 5 kmufgw8wx5qk 2023-06-05 GMfVH2ZWNX Groceries P0375
## 6 6 aI0KADT0mn7C 2023-03-15 YxqAmfTU9M Clothing P0447
## Quantity Unit_Price Discount Region Sales_Channel Delivery_Time Total_Price
## 1 2 15.18 0.00 North Online 5 30.36
## 2 5 10.22 0.15 West Offline 2 43.44
## 3 3 17.74 0.05 West Online 8 50.56
## 4 6 28.30 0.22 North Offline 8 132.44
## 5 3 11.91 0.13 North Offline 7 31.09
## 6 1 5.43 0.07 North Offline 9 5.05
## Price_per_Unit Efficiency Feature_Interaction Cross_Term ID_Prefix
## 1 15.18000 0.4000000 0.00 Clothing_North P0
## 2 8.68800 2.5000000 0.75 Electronics_West P0
## 3 16.85333 0.3750000 0.15 Home_West P0
## 4 22.07333 0.7500000 1.32 Home_North P0
## 5 10.36333 0.4285714 0.39 Groceries_North P0
## 6 5.05000 0.1111111 0.07 Clothing_North P0
## ID_Length ID_HasPattern Discount_Level Sales_Rank Avg_Quantity_Region
## 1 5 true 1 253 2.992908
## 2 5 true 2 157 3.116667
## 3 5 true 1 121 3.116667
## 4 5 true 3 4 2.992908
## 5 5 true 2 246 2.992908
## 6 5 true 1 466 2.992908
## Sum_Sales_Region Count_Product_Region
## 1 4864.94 141
## 2 4169.72 120
## 3 4169.72 120
## 4 4864.94 141
## 5 4864.94 141
## 6 4864.94 141
Heatmap
# Import the data
data_bisnis <- read.csv("data_bisnis.csv")

# Heatmap axes: Region on the columns, Product_Category on the rows.
# Levels keep their order of first appearance; missing labels are left out so
# they cannot contaminate the per-cell means.
tasks <- as.character(unique(data_bisnis$Region[!is.na(data_bisnis$Region)]))
models <- as.character(
  unique(data_bisnis$Product_Category[!is.na(data_bisnis$Product_Category)])
)

# Mean Total_Price for every (Product_Category, Region) cell.
# tapply() groups by factor level, so rows whose Region or Product_Category is
# NA never leak into a cell (logical indexing with `==` would add all-NA rows).
performance <- tapply(
  data_bisnis$Total_Price,
  list(
    factor(data_bisnis$Product_Category, levels = models),
    factor(data_bisnis$Region, levels = tasks)
  ),
  mean,
  na.rm = TRUE
)

# Cells with no transactions come back as NA and are scored as 0.
# A NaN (cell exists but every Total_Price is missing) is left as NaN.
performance[is.na(performance) & !is.nan(performance)] <- 0

# Normalise to the 0-100 range
min_val <- min(performance, na.rm = TRUE)
max_val <- max(performance, na.rm = TRUE)
if (isTRUE(max_val > min_val)) {
  performance <- ((performance - min_val) / (max_val - min_val)) * 100
} else {
  # All cells are equal: avoid 0 / 0 and score every cell as 0
  performance <- performance - min_val
}

# Reshape to long format for ggplot
performance_df <- as.data.frame(performance)
performance_df$Product_Category <- rownames(performance_df)
performance_long <- melt(
  performance_df,
  id.vars = "Product_Category",
  variable.name = "Region",
  value.name = "Score"
)

# Make sure the columns have the right types
performance_long$Product_Category <- as.factor(performance_long$Product_Category)
performance_long$Region <- as.factor(performance_long$Region)
performance_long$Score <- as.numeric(performance_long$Score)

# Draw the heatmap
ggplot(performance_long, aes(x = Region, y = Product_Category, fill = Score)) +
  geom_tile(color = "white") +
  geom_text(aes(label = sprintf("%.1f", Score)), color = "black", size = 3) +
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red", midpoint = 50,
    limits = c(0, 100), name = "Skor (%)"
  ) +
  labs(
    title = "Rata-rata Total Price per Region dan Kategori Produk",
    x = "Region (Tugas)", y = "Kategori Produk (Model)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(size = 16, face = "bold")
  )
Scatter Plot
# Import the data
data_bisnis <- read.csv("data_bisnis.csv")

# Scatter plot of Quantity against Total_Price, coloured by product category
ggplot(
  data = data_bisnis,
  mapping = aes(x = Quantity, y = Total_Price, color = Product_Category)
) +
  geom_point(alpha = 0.6) +
  scale_color_discrete(name = "Product Category") +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 16, face = "bold")
  ) +
  labs(
    title = "Quantity vs Total Price by Product Category",
    x = "Quantity",
    y = "Total Price"
  )
Bubble Chart
# Import the data
data_bisnis <- read.csv("data_bisnis.csv")

# Bubble chart: Quantity against Total_Price, with point size mapped to
# Discount and point colour mapped to Region
ggplot(
  data = data_bisnis,
  mapping = aes(
    x = Quantity,
    y = Total_Price,
    size = Discount,
    color = Region
  )
) +
  geom_point(alpha = 0.6) +
  scale_color_discrete(name = "Region") +
  scale_size(range = c(2, 10)) +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 16, face = "bold")
  ) +
  labs(
    title = "Quantity vs Total Price by Region",
    x = "Quantity",
    y = "Total Price"
  )
Correlation Matriks
# Import the data
data_bisnis <- read.csv("data_bisnis.csv")

# Pairwise correlations of the numeric columns, using only rows that are
# complete across all of the selected columns
numeric_vars <- c(
  "Quantity", "Unit_Price", "Discount", "Delivery_Time", "Total_Price"
)
numeric_cols <- data_bisnis[, numeric_vars]
cor_matrix <- cor(numeric_cols, use = "complete.obs")

# Reshape to long format for ggplot
cor_matrix_melted <- melt(cor_matrix)

# Draw the correlation matrix as a labelled tile plot
ggplot(
  data = cor_matrix_melted,
  mapping = aes(x = Var1, y = Var2, fill = value)
) +
  geom_tile(color = "white") +
  geom_text(
    mapping = aes(label = sprintf("%.2f", value)),
    color = "black",
    size = 4
  ) +
  scale_fill_gradient2(
    low = "blue",
    mid = "white",
    high = "red",
    midpoint = 0,
    limits = c(-1, 1),
    name = "Correlation"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(size = 16, face = "bold")
  ) +
  labs(
    title = "Correlation Matrix of Numeric Variables",
    x = "",
    y = ""
  )
Line Chart
# Import the data
data_bisnis <- read.csv("data_bisnis.csv")

# Parse the transaction date column as Date
data_bisnis$Transaction_Date <- as.Date(
  data_bisnis$Transaction_Date,
  format = "%Y-%m-%d"
)

# Recompute Total_Price from quantity, unit price and discount
data_bisnis$Total_Price <-
  data_bisnis$Quantity * data_bisnis$Unit_Price * (1 - data_bisnis$Discount)

# Keep transactions dated 2020-05-01 through 2025-05-31
start_date <- as.Date("2020-05-01")
end_date <- as.Date("2025-05-31")
filtered_data <- data_bisnis %>%
  filter(Transaction_Date >= start_date & Transaction_Date <= end_date)

# Six-month period column: January-June maps to the year itself (H1),
# July-December maps to year + 0.5 (H2). Previously both ifelse() branches
# returned the plain year, so the "half-yearly" chart was really yearly.
filtered_data <- filtered_data %>%
  mutate(
    Half_Year = year(Transaction_Date) +
      ifelse(month(Transaction_Date) <= 6, 0, 0.5)
  )

# Average Total_Price per product category and half-year period
half_yearly_category_sales <- filtered_data %>%
  group_by(Product_Category, Half_Year) %>%
  summarise(Avg_Total_Price = mean(Total_Price, na.rm = TRUE)) %>%
  ungroup()

# Axis breaks at every observed period, labelled like "2021 H1" / "2021 H2"
half_year_breaks <- sort(unique(half_yearly_category_sales$Half_Year))
half_year_labels <- paste0(
  floor(half_year_breaks),
  ifelse(half_year_breaks %% 1 == 0, " H1", " H2")
)

# Draw the line chart
# NOTE(review): on ggplot2 >= 3.4 `linewidth` is the preferred argument for
# line thickness; `size` is kept for compatibility with older versions.
ggplot(
  half_yearly_category_sales,
  aes(
    x = Half_Year, y = Avg_Total_Price,
    color = Product_Category, group = Product_Category
  )
) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  labs(
    title = "Half-Yearly Average Sales by Product Category (2020-2025)",
    x = "Half Year", y = "Average Total Price"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.title = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  scale_color_discrete(name = "Product Category") +
  scale_x_continuous(breaks = half_year_breaks, labels = half_year_labels)
Area Chart
# Import the data
data_bisnis <- read.csv("data_bisnis.csv")

# Parse the transaction date column as Date
data_bisnis$Transaction_Date <- as.Date(
  data_bisnis$Transaction_Date,
  format = "%Y-%m-%d"
)

# Recompute Total_Price from quantity, unit price and discount
data_bisnis$Total_Price <-
  data_bisnis$Quantity * data_bisnis$Unit_Price * (1 - data_bisnis$Discount)

# Keep transactions dated 2020-05-01 through 2025-05-31
start_date <- as.Date("2020-05-01")
end_date <- as.Date("2025-05-31")
filtered_data <- data_bisnis %>%
  filter(Transaction_Date >= start_date & Transaction_Date <= end_date)

# Six-month period column: January-June maps to the year itself (H1),
# July-December maps to year + 0.5 (H2). Previously both ifelse() branches
# returned the plain year, so the "half-yearly" chart was really yearly.
filtered_data <- filtered_data %>%
  mutate(
    Half_Year = year(Transaction_Date) +
      ifelse(month(Transaction_Date) <= 6, 0, 0.5)
  )

# Average Total_Price per product category and half-year period
half_yearly_category_sales <- filtered_data %>%
  group_by(Product_Category, Half_Year) %>%
  summarise(Avg_Total_Price = mean(Total_Price, na.rm = TRUE)) %>%
  ungroup()

# Axis breaks at every observed period, labelled like "2021 H1" / "2021 H2"
half_year_breaks <- sort(unique(half_yearly_category_sales$Half_Year))
half_year_labels <- paste0(
  floor(half_year_breaks),
  ifelse(half_year_breaks %% 1 == 0, " H1", " H2")
)

# Draw the stacked area chart
ggplot(
  half_yearly_category_sales,
  aes(
    x = Half_Year, y = Avg_Total_Price,
    fill = Product_Category, group = Product_Category
  )
) +
  geom_area(alpha = 0.6, position = "stack") +
  labs(
    title = "Half-Yearly Average Sales by Product Category (2020-2025)",
    x = "Half Year", y = "Average Total Price"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.title = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  scale_fill_discrete(name = "Product Category") +
  scale_x_continuous(breaks = half_year_breaks, labels = half_year_labels)