Pemrograman Sains Data

UTS Pemrograman Sains Data

Logo

1 Heatmap

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(RColorBrewer)

# Read the transaction data that feeds the heatmap.
data_bisnis <- read.csv(file = "Latihan bab 8.csv")

# One row per (Product_Category, Region) pair holding the summed Total_Price.
# Missing prices are skipped and the result is returned ungrouped.
sales_grouped <- data_bisnis |>
  group_by(Product_Category, Region) |>
  summarise(
    Total_Price = sum(Total_Price, na.rm = TRUE),
    .groups = "drop"
  )

# Tile heatmap: regions on x, product categories on y. Each tile is filled
# by, and labelled with, the total sales rounded to a whole number.
ggplot(
  data = sales_grouped,
  mapping = aes(x = Region, y = Product_Category, fill = Total_Price)
) +
  geom_tile(colour = "white") +
  geom_text(
    mapping = aes(label = round(Total_Price, 0)),
    colour = "black",
    size = 4
  ) +
  labs(
    title = "Heatmap Total Sales by Product Category and Region",
    x = "Region",
    y = "Product Category"
  ) +
  scale_fill_gradientn(
    name = "Total Sales",
    colours = brewer.pal(n = 9, name = "Blues")
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5),  # centre the title
    axis.text.x = element_text(angle = 45, hjust = 1)  # slanted region labels
  )


2 Scatter Plot

library(ggplot2)
library(dplyr)

# Read the raw transactions; strings stay plain character vectors.
data_bisnis <- read.csv(file = "Latihan bab 8.csv", stringsAsFactors = FALSE)

# Make sure the data is clean: coerce both plotted measures to numeric, then
# keep only the rows where neither of them is missing.
data_bisnis <- data_bisnis |>
  mutate(across(c(Quantity, Total_Price), as.numeric)) |>
  filter(!(is.na(Quantity) | is.na(Total_Price)))

# Scatter plot of Total_Price against Quantity, coloured by product category,
# with one straight-line (lm) trend per category and no confidence band.
ggplot(
  data = data_bisnis,
  mapping = aes(x = Quantity, y = Total_Price, colour = Product_Category)
) +
  geom_point(size = 1.8, alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, linewidth = 1.2) +
  theme_minimal() +
  theme(
    # Legend sits to the right of the panel so it does not cover the points.
    legend.position = "right",
    legend.text = element_text(size = 11),
    legend.title = element_text(size = 12),
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold")
  ) +
  labs(
    title = "Scatter Plot dengan Garis Tren: Total Price vs Quantity",
    x = "Quantity",
    y = "Total Price",
    colour = "Product Category"
  )
## `geom_smooth()` using formula = 'y ~ x'


3 Bubble Chart

# Load libraries
library(ggplot2)
library(dplyr)
library(readr)
## Warning: package 'readr' was built under R version 4.4.2
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
# Load data
df <- read_csv("Latihan bab 8.csv")
## New names:
## • `` -> `...1`
## Rows: 500 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): Transaction_ID, Customer_ID, Product_Category, Product_ID, Region...
## dbl  (15): ...1, Quantity, Unit_Price, Discount, Delivery_Time, Total_Price,...
## lgl   (1): ID_HasPattern
## date  (1): Transaction_Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert to numeric (a no-op when readr already parsed the columns as doubles)
df$Quantity <- as.numeric(df$Quantity)
df$Unit_Price <- as.numeric(df$Unit_Price)

# Compute Total_Price and drop rows missing any field the chart needs.
# NOTE(review): this overwrites the Total_Price column the CSV already
# provides and ignores Discount -- confirm that this is intended.
df <- df %>%
  mutate(Total_Price = Quantity * Unit_Price) %>%
  filter(!is.na(Quantity), !is.na(Unit_Price), !is.na(Total_Price), !is.na(Region))

# Aggregate: one row per (Product_Category, Region) with summed measures
agg <- df %>%
  group_by(Product_Category, Region) %>%
  summarise(
    Quantity = sum(Quantity, na.rm = TRUE),
    Total_Price = sum(Total_Price, na.rm = TRUE),
    .groups = 'drop'
  )

# Horizontal placement. Each category gets an integer slot (x_base) and each
# region a fixed offset inside that slot. The offsets are deterministic, so
# the figure is identical on every run and two regions of the same category
# can never land on the same x position (random jitter gave neither guarantee).
category_levels <- levels(as.factor(agg$Product_Category))
region_levels <- sort(unique(agg$Region))
max_offset <- 0.2
region_offsets <- if (length(region_levels) > 1) {
  seq(-max_offset, max_offset, length.out = length(region_levels))
} else {
  0
}

agg <- agg %>%
  mutate(
    x_base = as.numeric(factor(Product_Category, levels = category_levels)),
    # Column name kept as x_jitter for compatibility; the value is a fixed
    # per-region offset rather than random noise.
    x_jitter = x_base + region_offsets[match(Region, region_levels)]
  )

# Bubble size scale
max_size <- 25  # relative size (not pixels as in matplotlib)
min_size <- 4
scaled <- rescale(agg$Total_Price, to = c(min_size, max_size))
agg$bubble_size <- scaled

# Plot
ggplot(agg, aes(x = x_jitter, y = Quantity, size = bubble_size, fill = Region)) +
  geom_point(shape = 21, color = "black", alpha = 0.6) +
  geom_text(aes(label = Product_Category), color = "black", fontface = "bold", size = 3, vjust = 0.5, hjust = 0.5) +
  scale_size_identity() +  # bubble_size is already in plotting units
  scale_fill_brewer(palette = "Set2") +
  scale_x_continuous(
    # Breaks and labels are derived from the same level vector so they stay
    # aligned regardless of the row order of `agg`.
    breaks = seq_along(category_levels),
    labels = category_levels
  ) +
  labs(
    title = "4D Bubble Chart: Quantity vs Product Category\n(Bubble = Total Price, Color = Region)",
    x = "Product Category",
    y = "Total Quantity",
    fill = "Region"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, hjust = 0.5)
  )


4 Correlation Matrix

# Load library
library(readr)
library(dplyr)
library(ggplot2)
library(tidyr)

# Read the transactions with readr.
df <- read_csv("Latihan bab 8.csv")
## New names:
## Rows: 500 Columns: 25
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (8): Transaction_ID, Customer_ID, Product_Category, Product_ID, Region... dbl
## (15): ...1, Quantity, Unit_Price, Discount, Delivery_Time, Total_Price,... lgl
## (1): ID_HasPattern date (1): Transaction_Date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Force the two inputs to numeric, then derive Total_Price from them.
df <- df |>
  mutate(across(c(Quantity, Unit_Price), as.numeric)) |>
  mutate(Total_Price = Quantity * Unit_Price)

# Keep only the three numeric measures and discard incomplete rows.
numeric_df <- df |>
  select(Quantity, Unit_Price, Total_Price) |>
  drop_na()

# Pairwise correlations (cor() defaults), rounded to two decimals.
corr_matrix <- round(cor(numeric_df), digits = 2)

# Long format: one row per matrix cell, columns Var1 / Var2 / Correlation.
corr_long <- as.data.frame(
  as.table(corr_matrix),
  responseName = "Correlation"
)

# Tile plot of the matrix with a diverging blue-white-red palette
# (similar to 'coolwarm') fixed to the full [-1, 1] range.
ggplot(
  data = corr_long,
  mapping = aes(x = Var1, y = Var2, fill = Correlation)
) +
  geom_tile(colour = "white") +
  geom_text(mapping = aes(label = sprintf("%.2f", Correlation)), size = 5) +
  scale_fill_gradientn(
    name = "Correlation",
    limits = c(-1, 1),
    colours = c("#3b4cc0", "#78aadd", "#ffffff", "#f7945d", "#b40426")
  ) +
  labs(title = "Correlation Matrix", x = NULL, y = NULL) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)
  )


5 Line Chart

library(ggplot2)
library(readr)
library(dplyr)

# 1. Load dataset
data_bisnis <- read_csv("Latihan bab 8.csv")
## New names:
## Rows: 500 Columns: 25
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (8): Transaction_ID, Customer_ID, Product_Category, Product_ID, Region... dbl
## (15): ...1, Quantity, Unit_Price, Discount, Delivery_Time, Total_Price,... lgl
## (1): ID_HasPattern date (1): Transaction_Date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Make sure Quantity is numeric and drop rows where it is missing
data_bisnis <- data_bisnis %>%
  mutate(Quantity = as.numeric(Quantity)) %>%
  filter(!is.na(Quantity))

# Locate the date column. The column specification above lists the date
# field as `Transaction_Date`, so testing only for `Date` never matched and
# the time-series branch was unreachable. `Date` is still checked first so a
# file that does carry that column behaves as before.
date_col <- intersect(c("Date", "Transaction_Date"), names(data_bisnis))

if (length(date_col) > 0) {
  # Time-based line chart: total quantity per day and product category
  data_bisnis$Date <- as.Date(data_bisnis[[date_col[1]]])

  df_line <- data_bisnis %>%
    filter(!is.na(Date)) %>%  # rows without a date cannot be placed on the x axis
    group_by(Date, Product_Category) %>%
    summarise(Quantity = sum(Quantity, na.rm = TRUE), .groups = 'drop')

  ggplot(df_line, aes(x = Date, y = Quantity, color = Product_Category)) +
    geom_line() +
    labs(title = "Tren Kuantitas per Kategori Produk",
         x = "Tanggal", y = "Kuantitas") +
    theme_minimal()
} else {
  # No date column available: aggregate by product category instead
  df_line <- data_bisnis %>%
    group_by(Product_Category) %>%
    summarise(Quantity = sum(Quantity), .groups = 'drop')

  # group = 1 joins the discrete categories with a single line
  ggplot(df_line, aes(x = Product_Category, y = Quantity, group = 1)) +
    geom_line() +
    geom_point() +
    labs(title = "Total Kuantitas per Kategori Produk",
         x = "Kategori Produk", y = "Kuantitas") +
    theme_minimal()
}


6 Area Chart

# Load library
library(ggplot2)
library(dplyr)
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.3
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# =============== #
# 1. Load Dataset #
# =============== #
data_bisnis <- read.csv("Latihan bab 8.csv")

# Coerce the two fields this chart uses to their proper types
data_bisnis$Total_Price <- as.numeric(data_bisnis$Total_Price)
data_bisnis$Transaction_Date <- as.Date(data_bisnis$Transaction_Date)

# Drop only the rows that lack a price or a date. A blanket na.omit() would
# also discard every transaction with an NA in any unrelated column and so
# understate the monthly totals.
data_bisnis <- data_bisnis %>%
  filter(!is.na(Total_Price), !is.na(Transaction_Date))

# Group by calendar month (first day of the month) and sum Total_Price
data_bisnis$Month <- floor_date(data_bisnis$Transaction_Date, "month")
data_grouped <- data_bisnis %>%
  group_by(Month) %>%
  summarise(Total_Price = sum(Total_Price), .groups = "drop")

# ========================== #
# 2. Build the Area Chart    #
# ========================== #
ggplot(data_grouped, aes(x = Month, y = Total_Price)) +
  geom_area(fill = "skyblue", alpha = 0.5) +  # Area chart
  # `linewidth` replaces the `size` aesthetic for lines (deprecated in
  # ggplot2 3.4.0)
  geom_line(color = "Slateblue", linewidth = 1.2) +  # Line on top of the area
  labs(title = "Total Price per Bulan", 
       x = "Bulan", 
       y = "Total Price") +
  theme_minimal(base_size = 14) +  # Minimal theme
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),  # Centred, bold title
        axis.text.x = element_text(angle = 45, hjust = 1)) +  # Slant the x-axis labels
  scale_y_continuous(labels = scales::dollar)  # Format the y axis as dollars

