1. Introduction

This report presents data transformations on a weather dataset. The transformations include grouping by location and season, and calculating the average temperature, total rainfall, and the number of observations per group.

# Ensure all required packages are installed
packages <- c("dplyr", "stringi", "lubridate", "DT")
new_packages <- packages[!(packages %in% installed.packages()[, "Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load libraries
library(dplyr)
library(stringi)
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.3
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(DT)
## Warning: package 'DT' was built under R version 4.4.3
# Create a complex dummy weather dataset for data transformation
set.seed(42)
n <- 500

dates <- seq.Date(from = as.Date("2020-01-01"), to = as.Date("2024-12-31"), by = "day")
sample_dates <- sample(dates, n, replace = TRUE)

month <- month(sample_dates)
season <- case_when(
  month %in% c(11, 12, 1, 2) ~ "Rainy Season",
  month %in% c(6, 7, 8, 9) ~ "Dry Season",
  TRUE ~ "Transitional Season"
)

humidity <- round(runif(n, 60, 100), 1)
temperature <- round(rnorm(n, mean = 27, sd = 3), 1)
rainfall <- round(rgamma(n, shape = 2, rate = 0.2), 1)
wind_speed <- round(runif(n, 1, 15), 1)

weather_data <- tibble(
  Observation_ID = stri_rand_strings(n, 12),
  Date = sample_dates,
  Location = sample(c("Jakarta", "Bandung", "Surabaya", "Medan", "Makassar"), n, replace = TRUE),
  Season = season,
  Temperature = temperature,
  Humidity = humidity,
  Rainfall = rainfall,
  Wind_Speed = wind_speed
)

# Show interactive table with download buttons
datatable(
  weather_data,
  extensions = 'Buttons',
  options = list(
    dom = 'Bfrtip',
    buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
    scrollY = "400px",
    scrollCollapse = TRUE,
    paging = FALSE
  ),
  caption = htmltools::tags$caption(
    style = 'caption-side: top; text-align: left; 
             font-size: 18px; font-weight: bold;'
  ),
  class = 'stripe hover compact'
)

2. Data Loading

# Load the dataset
weather_data <- read_excel("6 Data Transformation – Data Science Programming.xlsx", sheet = "Sheet1", skip = 1)
## New names:
## • `` -> `...1`
# Rename columns for clarity
colnames(weather_data) <- c("ID", "Observation_ID", "Date", "Location", "Season", "Temperature", "Humidity", "Rainfall", "Wind_Speed")

# Display first few rows
head(weather_data)
## # A tibble: 6 × 9
##      ID Observation_ID Date                Location Season  Temperature Humidity
##   <dbl> <chr>          <dttm>              <chr>    <chr>         <dbl>    <dbl>
## 1     1 JpeDLWuzIJdl   2021-07-14 00:00:00 Jakarta  Dry Se…        30.7     89.5
## 2     2 EF8r6hXBCqfr   2020-11-16 00:00:00 Bandung  Rainy …        26.2     70.1
## 3     3 cov0TYDwyQoF   2023-03-22 00:00:00 Makassar Transi…        27.5     80.7
## 4     4 aRB0N7xSEnfa   2023-01-02 00:00:00 Surabaya Rainy …        26.8     90.4
## 5     5 fjf9bvNshjHQ   2023-06-05 00:00:00 Medan    Dry Se…        24.9     85.4
## 6     6 jnQGmBA0NZn0   2023-03-15 00:00:00 Jakarta  Transi…        31.1     68.2
## # ℹ 2 more variables: Rainfall <dbl>, Wind_Speed <dbl>

3. Data Transformation

Group by Location

grouped_location <- weather_data %>%
  group_by(Location) %>%
  summarise(
    avg_temperature = mean(Temperature, na.rm = TRUE),
    total_rainfall = sum(Rainfall, na.rm = TRUE),
    observation_count = n()
  )

# Display result
grouped_location
## # A tibble: 5 × 4
##   Location avg_temperature total_rainfall observation_count
##   <chr>              <dbl>          <dbl>             <int>
## 1 Bandung             26.7           976.               101
## 2 Jakarta             27.3           867.                98
## 3 Makassar            26.5          1019.                91
## 4 Medan               27.3           986.               107
## 5 Surabaya            26.8          1056.               103

Group by Season

grouped_season <- weather_data %>%
  group_by(Season) %>%
  summarise(
    avg_temperature = mean(Temperature, na.rm = TRUE),
    total_rainfall = sum(Rainfall, na.rm = TRUE),
    observation_count = n()
  )

# Display result
grouped_season
## # A tibble: 3 × 4
##   Season              avg_temperature total_rainfall observation_count
##   <chr>                         <dbl>          <dbl>             <int>
## 1 Dry Season                     27.0          1760.               177
## 2 Rainy Season                   26.6          1380.               152
## 3 Transitional Season            27.1          1765.               171

4. Visualization

Average Temperature by Location

ggplot(grouped_location, aes(x = Location, y = avg_temperature, fill = Location)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(title = "Average Temperature by Location", x = "Location", y = "Average Temperature (°C)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Total Rainfall by Season

ggplot(grouped_season, aes(x = Season, y = total_rainfall, fill = Season)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(title = "Total Rainfall by Season", x = "Season", y = "Total Rainfall (mm)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

5. Results and Interpretation

From the grouping by

Location:

From the grouping by

Season:

6. Conclusion

By grouping the weather data, we can better understand the relationship between locations, seasons, and weather parameters such as temperature and rainfall. This type of transformation is useful for identifying weather patterns and making data-driven decisions related to climate studies or planning events.