Practicum 10 — Data Transformation
Practicum Data Transformation
Weather Data Transformation Techniques — Data Science Programming
Group 1 Members
1 Introduction
Practicum Description
- Demonstrate various data transformation techniques on an Indonesian weather dataset.
- Covers: Temporal Transformation, Normalization, Categorical Encoding, Feature Engineering, Outlier Detection, Binning, and Seasonal Signals (Fourier).
- Dataset: 500 weather observations from 5 Indonesian cities (Jakarta, Bandung, Makassar, Surabaya, Medan).
2 Library Import & Data Loading
# ============================================================
# IMPORT LIBRARIES
# ============================================================
library(DT) # interactive tables
library(dplyr) # data manipulation
library(lubridate) # datetime operations
library(htmltools) # HTML helpers for captions
library(plotly) # interactive visualizations
library(scales) # number formatting
# ============================================================
# LOAD DATA
# ============================================================
df_weather <- read.csv("C:/Users/USER/AppData/Local/Temp/Rar$DRa1040.42618.rartemp/Praktikum_week10(kel1)/6_Data-Transformation___Data_Science_Programming.csv",
stringsAsFactors = FALSE)
# Rename columns to be more descriptive (English)
colnames(df_weather) <- c("No", "Observation_ID", "Date", "Location",
"Season", "Temperature", "Humidity", "Rainfall", "Wind_Speed")
# Convert data types
df_weather$Date <- as.Date(df_weather$Date)
df_weather$Temperature <- as.numeric(df_weather$Temperature)
df_weather$Humidity <- as.numeric(df_weather$Humidity)
df_weather$Rainfall <- as.numeric(df_weather$Rainfall)
df_weather$Wind_Speed <- as.numeric(df_weather$Wind_Speed)
tbl_shape <- data.frame(
Description = c("Number of Rows", "Number of Columns"),
Value = c(nrow(df_weather), ncol(df_weather))
)
datatable(tbl_shape, rownames = FALSE, class = "display compact hover",
caption = tags$caption(style = 'caption-side:top; text-align:center; color:#0d1b3e;
font-size:16px; font-weight:700; font-family:Playfair Display,serif; padding-bottom:10px;',
'Dataset Shape'),
options = list(dom = 't', initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color':'#0d1b3e','color':'#e2c97e',
'font-family':'Source Sans 3, sans-serif','letter-spacing':'0.06em'});",
"}"
))
)tbl_types <- data.frame(
Column = names(sapply(df_weather, class)),
Type = unname(sapply(df_weather, class))
)
datatable(tbl_types, rownames = FALSE, class = "display compact hover",
caption = tags$caption(style = 'caption-side:top; text-align:center; color:#0d1b3e;
font-size:16px; font-weight:700; font-family:Playfair Display,serif; padding-bottom:10px;',
'Data Type per Column'),
options = list(dom = 'tp', pageLength = 15, initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color':'#0d1b3e','color':'#e2c97e',
'font-family':'Source Sans 3, sans-serif','letter-spacing':'0.06em'});",
"}"
))
)tbl_date <- data.frame(Sample_Date = as.character(head(df_weather$Date, 5)))
datatable(tbl_date, rownames = FALSE, class = "display compact hover",
caption = tags$caption(style = 'caption-side:top; text-align:center; color:#0d1b3e;
font-size:16px; font-weight:700; font-family:Playfair Display,serif; padding-bottom:10px;',
'Date Format (First 5 Rows)'),
options = list(dom = 't', initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color':'#0d1b3e','color':'#e2c97e',
'font-family':'Source Sans 3, sans-serif','letter-spacing':'0.06em'});",
"}"
))
)tbl_na <- data.frame(
Column = names(colSums(is.na(df_weather))),
NA_Count = unname(colSums(is.na(df_weather)))
)
datatable(tbl_na, rownames = FALSE, class = "display compact hover",
caption = tags$caption(style = 'caption-side:top; text-align:center; color:#0d1b3e;
font-size:16px; font-weight:700; font-family:Playfair Display,serif; padding-bottom:10px;',
'Missing Values per Column'),
options = list(dom = 'tp', pageLength = 15, initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color':'#0d1b3e','color':'#e2c97e',
'font-family':'Source Sans 3, sans-serif','letter-spacing':'0.06em'});",
"}"
))
)2.1 Raw Data Preview
Weather Dataset — 500 Observations
2.2 Descriptive Statistics
Descriptive Statistics
3 Data Cleaning & Temporal Transformation
Data Cleaning Steps
- Impute missing values using the median.
- Extract time components: Year, Month, Day, Day-of-Year.
-
Convert the date column to the
Dateformat.
# ============================================================
# IMPUTE MISSING VALUES WITH MEDIAN
# ============================================================
for (col in cols_num) {
median_val <- median(df_weather[[col]], na.rm = TRUE)
df_weather[[col]][is.na(df_weather[[col]])] <- median_val
}
# ============================================================
# EXTRACT DATE COMPONENTS (Temporal Transformation)
# ============================================================
df_weather$Year <- year(df_weather$Date)
df_weather$Month <- month(df_weather$Date)
df_weather$Day <- day(df_weather$Date)
df_weather$DayOfYear <- yday(df_weather$Date)
df_weather$MonthName <- month(df_weather$Date, label = TRUE, abbr = FALSE)
df_weather$DayName <- weekdays(df_weather$Date)
tbl_cleaning <- data.frame(
Description = c("Missing Values After Cleaning", "Date Components"),
Result = c(sum(is.na(df_weather[, cols_num])),
"Year, Month, Day, DayOfYear, MonthName, DayName — successfully extracted")
)
datatable(tbl_cleaning, rownames = FALSE, class = "display compact hover",
caption = tags$caption(style = 'caption-side:top; text-align:center; color:#0d1b3e;
font-size:16px; font-weight:700; font-family:Playfair Display,serif; padding-bottom:10px;',
'Cleaning Summary'),
options = list(dom = 't', initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color':'#0d1b3e','color':'#e2c97e',
'font-family':'Source Sans 3, sans-serif','letter-spacing':'0.06em'});",
"}"
))
)Data After Cleaning & Date Extraction
4 Feature Engineering
Features Created
- Rolling Average: Weekly (7-day) and monthly (30-day) moving averages for Temperature.
- Lag & Differencing: Previous day’s temperature and daily temperature difference.
- Fourier Cycle Features: Seasonal sin/cos based on day-of-year.
- Heat Index: Heat index derived from temperature and humidity.
-
Season Classification: Based on the
Seasoncolumn in the dataset.
# ============================================================
# 1. ROLLING AVERAGE (Moving Average)
# ============================================================
# Sort data by date before rolling
df_weather <- df_weather[order(df_weather$Date), ]
roll_mean <- function(x, n) {
result <- numeric(length(x))
for (i in seq_along(x)) {
start_idx <- max(1, i - n + 1)
result[i] <- mean(x[start_idx:i], na.rm = TRUE)
}
result
}
df_weather$Temp_MA7 <- roll_mean(df_weather$Temperature, 7)
df_weather$Temp_MA30 <- roll_mean(df_weather$Temperature, 30)
# ============================================================
# 2. LAG & DIFFERENCING
# ============================================================
df_weather$Temp_Lag1 <- c(NA, head(df_weather$Temperature, -1)) # previous day temperature
df_weather$Temp_Diff <- c(NA, diff(df_weather$Temperature)) # daily temperature difference
# ============================================================
# 3. FOURIER CYCLE / SEASONAL FEATURES (sin & cos)
# ============================================================
# sin_year = sin(2π × DayOfYear / 365.25)
# cos_year = cos(2π × DayOfYear / 365.25)
df_weather$Sin_Year <- sin(2 * pi * df_weather$DayOfYear / 365.25)
df_weather$Cos_Year <- cos(2 * pi * df_weather$DayOfYear / 365.25)
# ============================================================
# 4. HEAT INDEX
# ============================================================
# Simple formula: Temperature + 0.55 × (Humidity/100)
df_weather$Heat_Index <- df_weather$Temperature + (0.55 * (df_weather$Humidity / 100))
tbl_fe <- data.frame(
New_Feature = c("Temp_MA7", "Temp_MA30", "Temp_Lag1", "Temp_Diff",
"Sin_Year", "Cos_Year", "Heat_Index"),
Description = c("7-day moving average", "30-day moving average",
"Previous day temperature", "Daily temperature difference",
"Seasonal sine component (Fourier)", "Seasonal cosine component (Fourier)",
"Heat index (Temperature + 0.55 × Humidity/100)")
)
datatable(tbl_fe, rownames = FALSE, class = "display compact hover",
caption = tags$caption(style = 'caption-side:top; text-align:center; color:#0d1b3e;
font-size:16px; font-weight:700; font-family:Playfair Display,serif; padding-bottom:10px;',
'Summary of New Features Created'),
options = list(dom = 't', initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color':'#0d1b3e','color':'#e2c97e',
'font-family':'Source Sans 3, sans-serif','letter-spacing':'0.06em'});",
"}"
))
)Feature Engineering Results
5 Categorization & Binning
Binning & Weather Categorization
- Rainfall Binning: None / Light / Moderate / Heavy.
- Weather Condition: Storm / Cloudy / Hot Sunny / Sunny.
# ============================================================
# RAINFALL BINNING
# ============================================================
df_weather$Rain_Category <- cut(
df_weather$Rainfall,
breaks = c(-Inf, 0, 5, 15, Inf),
labels = c("None", "Light", "Moderate", "Heavy"),
right = TRUE
)
# ============================================================
# WEATHER CONDITION CLASSIFICATION
# ============================================================
df_weather$Weather_Condition <- ifelse(
df_weather$Rainfall > 10, "Storm",
ifelse(df_weather$Rainfall > 2, "Cloudy",
ifelse(df_weather$Temperature > 30, "Hot Sunny", "Sunny")
)
)
tbl_rain_cat <- as.data.frame(table(df_weather$Rain_Category))
colnames(tbl_rain_cat) <- c("Rain_Category", "Frequency")
datatable(tbl_rain_cat, rownames = FALSE, class = "display compact hover",
caption = tags$caption(style = 'caption-side:top; text-align:center; color:#0d1b3e;
font-size:16px; font-weight:700; font-family:Playfair Display,serif; padding-bottom:10px;',
'Rainfall Category Distribution'),
options = list(dom = 't', initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color':'#0d1b3e','color':'#e2c97e',
'font-family':'Source Sans 3, sans-serif','letter-spacing':'0.06em'});",
"}"
))
)tbl_condition <- as.data.frame(table(df_weather$Weather_Condition))
colnames(tbl_condition) <- c("Weather_Condition", "Frequency")
datatable(tbl_condition, rownames = FALSE, class = "display compact hover",
caption = tags$caption(style = 'caption-side:top; text-align:center; color:#0d1b3e;
font-size:16px; font-weight:700; font-family:Playfair Display,serif; padding-bottom:10px;',
'Weather Condition Distribution'),
options = list(dom = 't', initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color':'#0d1b3e','color':'#e2c97e',
'font-family':'Source Sans 3, sans-serif','letter-spacing':'0.06em'});",
"}"
))
)Binning & Categorization Results
6 Outlier Detection & Handling
Outlier Detection Methods
- Z-Score: Flag values where |z| > 3.
- IQR: Lower Bound = Q1 − 1.5 × IQR | Upper Bound = Q3 + 1.5 × IQR.
# ============================================================
# OUTLIER DETECTION — Z-SCORE & IQR
# ============================================================
outlier_results <- data.frame()
for (col in cols_num) {
x <- df_weather[[col]]
z <- (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR_val <- Q3 - Q1
outlier_results <- rbind(outlier_results, data.frame(
Column = col,
Outliers_ZScore = sum(abs(z) > 3, na.rm = TRUE),
Outliers_IQR = sum(x < (Q1 - 1.5 * IQR_val) | x > (Q3 + 1.5 * IQR_val), na.rm = TRUE),
Q1 = round(Q1, 2),
Q3 = round(Q3, 2),
IQR = round(IQR_val, 2),
Lower_Bound = round(Q1 - 1.5 * IQR_val, 2),
Upper_Bound = round(Q3 + 1.5 * IQR_val, 2)
))
}
invisible(outlier_results)Outlier Detection Summary
7 Temporal & Rolling Features
Rolling Mean & Lag Features
- Temp_MA7: 7-day moving average.
- Temp_MA30: 30-day moving average.
- Temp_Lag1: Previous day’s temperature.
- Temp_Diff: Daily temperature difference.
Temporal Rolling Features
8 Normalization & Data Scaling
Normalization Methods Applied
- Min-Max Scaling: Maps values to the range [0, 1].
- Z-Score Standardization: Mean = 0, Standard Deviation = 1.
# ============================================================
# MIN-MAX NORMALIZATION & Z-SCORE STANDARDIZATION
# ============================================================
for (col in cols_num) {
x <- df_weather[[col]]
mn <- min(x, na.rm = TRUE)
mx <- max(x, na.rm = TRUE)
# Min-Max: (x - min) / (max - min)
df_weather[[paste0(col, "_minmax")]] <- (x - mn) / (mx - mn)
# Z-Score: (x - mean) / sd
df_weather[[paste0(col, "_zscore")]] <- (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}
tbl_norm_info <- data.frame(
Method = c(rep("Min-Max", length(cols_num)), rep("Z-Score", length(cols_num))),
New_Column = c(paste0(cols_num, "_minmax"), paste0(cols_num, "_zscore"))
)
datatable(tbl_norm_info, rownames = FALSE, class = "display compact hover",
caption = tags$caption(style = 'caption-side:top; text-align:center; color:#0d1b3e;
font-size:16px; font-weight:700; font-family:Playfair Display,serif; padding-bottom:10px;',
'New Columns from Normalization'),
options = list(dom = 't', initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color':'#0d1b3e','color':'#e2c97e',
'font-family':'Source Sans 3, sans-serif','letter-spacing':'0.06em'});",
"}"
))
)Normalization Results
9 Categorical Encoding
Categorical Variable Encoding
-
Label Encoding: Convert the
SeasonandLocationcolumns into numeric values. - One-Hot Encoding: Create 0/1 dummy columns for each season category.
# ============================================================
# LABEL ENCODING
# ============================================================
df_weather$Season_Label <- as.integer(factor(df_weather$Season))
df_weather$Location_Label <- as.integer(factor(df_weather$Location))
# Display encoding maps
season_map <- data.frame(Season = levels(factor(df_weather$Season)),
Label = 1:length(levels(factor(df_weather$Season))))
location_map <- data.frame(Location = levels(factor(df_weather$Location)),
Label = 1:length(levels(factor(df_weather$Location))))
datatable(season_map, rownames = FALSE, class = "display compact hover",
caption = tags$caption(style = 'caption-side:top; text-align:center; color:#0d1b3e;
font-size:16px; font-weight:700; font-family:Playfair Display,serif; padding-bottom:10px;',
'Label Encoding Map — Season'),
options = list(dom = 't', initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color':'#0d1b3e','color':'#e2c97e',
'font-family':'Source Sans 3, sans-serif','letter-spacing':'0.06em'});",
"}"
))
)datatable(location_map, rownames = FALSE, class = "display compact hover",
caption = tags$caption(style = 'caption-side:top; text-align:center; color:#0d1b3e;
font-size:16px; font-weight:700; font-family:Playfair Display,serif; padding-bottom:10px;',
'Label Encoding Map — Location'),
options = list(dom = 't', initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color':'#0d1b3e','color':'#e2c97e',
'font-family':'Source Sans 3, sans-serif','letter-spacing':'0.06em'});",
"}"
))
)# ============================================================
# ONE-HOT ENCODING — Season
# ============================================================
season_levels <- unique(df_weather$Season)
for (lvl in season_levels) {
col_name <- paste0("Season_", gsub(" ", "_", lvl))
df_weather[[col_name]] <- as.integer(df_weather$Season == lvl)
}
tbl_ohe_info <- data.frame(
Dummy_Column = paste0("Season_", gsub(" ", "_", unique(df_weather$Season))),
Description = paste0("Equals 1 if Season = '", unique(df_weather$Season), "', otherwise 0")
)
datatable(tbl_ohe_info, rownames = FALSE, class = "display compact hover",
caption = tags$caption(style = 'caption-side:top; text-align:center; color:#0d1b3e;
font-size:16px; font-weight:700; font-family:Playfair Display,serif; padding-bottom:10px;',
'One-Hot Encoding Columns Created for Season'),
options = list(dom = 't', initComplete = JS(
"function(settings, json) {",
"$(this.api().table().header()).css({'background-color':'#0d1b3e','color':'#e2c97e',
'font-family':'Source Sans 3, sans-serif','letter-spacing':'0.06em'});",
"}"
))
)Categorical Encoding Results
10 Interactive Visualizations
10.1 Temperature Distribution by Location
Interactive Boxplot — Temperature per City
city_colors <- c(
"Jakarta" = "#0d1b3e",
"Bandung" = "#1f3c88",
"Makassar" = "#c9a84c",
"Surabaya" = "#e2c97e",
"Medan" = "#4a90d9"
)
plot_ly(
data = df_weather,
x = ~Location,
y = ~Temperature,
color = ~Location,
colors = city_colors,
type = "box",
boxpoints = "outliers"
) %>%
layout(
title = list(text = "Temperature Distribution per City", font = list(family = "Playfair Display")),
xaxis = list(title = "City"),
yaxis = list(title = "Temperature (°C)"),
showlegend = FALSE,
plot_bgcolor = "#f8f9fc",
paper_bgcolor = "#ffffff"
)Interpretation
The boxplot displays the distribution of daily temperatures across five cities. In general, Makassar and Surabaya tend to have higher median temperatures compared to other cities, reflecting the characteristics of a hotter coastal tropical climate. Bandung shows a lower and narrower temperature distribution, consistent with its geographic location on a highland plateau. The presence of outlier points in some cities indicates days with unusually extreme temperatures, and this finding aligns with the IQR-based outlier detection results from the previous step. The varying IQR widths across cities also suggest that temperature volatility is not geographically uniform.
10.2 Temperature Trend Over Time
Temperature Trend — 30-Day Rolling Mean
plot_ly(df_weather, x = ~Date) %>%
add_lines(y = ~Temperature, name = "Daily Temperature",
line = list(color = "rgba(201,168,76,0.3)", width = 1)) %>%
add_lines(y = ~Temp_MA7, name = "MA-7 Days",
line = list(color = "#1f3c88", width = 2)) %>%
add_lines(y = ~Temp_MA30, name = "MA-30 Days",
line = list(color = "#c9a84c", width = 2.5, dash = "dash")) %>%
layout(
title = list(text = "Daily Temperature Trend + Rolling Average", font = list(family = "Playfair Display")),
xaxis = list(title = "Date"),
yaxis = list(title = "Temperature (°C)"),
legend = list(orientation = "h", y = -0.2),
plot_bgcolor = "#f8f9fc",
paper_bgcolor = "#ffffff"
)Interpretation
The trend chart displays three data layers: raw daily temperature, MA-7, and MA-30. The raw daily temperature appears highly volatile with significant noise, while MA-7 effectively smooths out daily fluctuations to expose weekly patterns. MA-30 provides the cleanest medium-term trend view, and from it a recurring seasonal cycle pattern is visible throughout the observation period. Convergence between MA-7 and MA-30 in certain periods indicates relatively stable temperature phases, while divergence signals seasonal transitions or weather anomalies. This visualization directly validates the utility of the rolling average features built during the feature engineering stage.
10.3 Weather Condition Distribution
Interactive Pie Chart — Weather Conditions
condition_count <- as.data.frame(table(df_weather$Weather_Condition))
colnames(condition_count) <- c("Condition", "Frequency")
plot_ly(
condition_count,
labels = ~Condition,
values = ~Frequency,
type = "pie",
marker = list(colors = c("#0d1b3e", "#1f3c88", "#c9a84c", "#e2c97e"),
line = list(color = "#ffffff", width = 2))
) %>%
layout(
title = list(text = "Weather Condition Distribution", font = list(family = "Playfair Display")),
paper_bgcolor = "#ffffff"
)Interpretation
The pie chart shows the proportion of each weather condition: Sunny, Hot Sunny, Cloudy, and Storm. The dominance of certain conditions — generally Sunny or Cloudy — reflects the characteristics of Indonesia’s tropical climate, which has intense rainfall but not uniformly distributed throughout the year. The Storm condition (Rainfall > 10 mm) occupies the smallest portion, consistent with the expectation that extreme weather is a minority occurrence in the data distribution. This proportion is important as a reference for class imbalance if the dataset is used for weather condition classification modeling.
10.4 Scatter: Temperature vs Rainfall
Interactive Scatter Plot
plot_ly(
data = df_weather,
x = ~Temperature,
y = ~Rainfall,
color = ~Season,
colors = c("#0d1b3e", "#c9a84c", "#4a90d9"),
type = "scatter",
mode = "markers",
text = ~paste0("Location: ", Location, "<br>Date: ", Date,
"<br>Temperature: ", Temperature, "°C<br>Rainfall: ", Rainfall, " mm"),
hoverinfo = "text",
marker = list(size = 7, opacity = 0.7)
) %>%
layout(
title = list(text = "Temperature vs Rainfall by Season", font = list(family = "Playfair Display")),
xaxis = list(title = "Temperature (°C)"),
yaxis = list(title = "Rainfall (mm)"),
legend = list(title = list(text = "Season")),
plot_bgcolor = "#f8f9fc",
paper_bgcolor = "#ffffff"
)Interpretation
The scatter plot reveals the relationship between temperature and
rainfall differentiated by season. In general, a weak negative
pattern is visible — higher temperatures tend to be associated
with lower rainfall, and vice versa. This makes meteorological sense
because during the dry season, temperatures are higher and rainfall is
minimal, while the rainy season brings high rainfall with slightly lower
temperatures. The color separation by season emphasizes that
data clusters form naturally along seasonal lines,
confirming that the Season variable is a significant
separating factor and worth retaining as a feature in predictive
modeling.
11 Final Complete Dataset
Final Dataset — All Transformation Features
Practicum 6 · Data Transformation · Data Science Programming · ITSB · May 2026