This report presents data transformations on a weather dataset. The transformations include grouping by location and season, and calculating the average temperature, total rainfall, and the number of observations per group.
# Ensure all required packages are installed
packages <- c("dplyr", "stringi", "lubridate", "DT")
new_packages <- packages[!(packages %in% installed.packages()[, "Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load libraries
library(dplyr)
library(stringi)
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(DT)
## Warning: package 'DT' was built under R version 4.4.3
# Create a complex dummy weather dataset for data transformation
set.seed(42)
n <- 500
dates <- seq.Date(from = as.Date("2020-01-01"), to = as.Date("2024-12-31"), by = "day")
sample_dates <- sample(dates, n, replace = TRUE)
month <- month(sample_dates)
season <- case_when(
month %in% c(11, 12, 1, 2) ~ "Rainy Season",
month %in% c(6, 7, 8, 9) ~ "Dry Season",
TRUE ~ "Transitional Season"
)
humidity <- round(runif(n, 60, 100), 1)
temperature <- round(rnorm(n, mean = 27, sd = 3), 1)
rainfall <- round(rgamma(n, shape = 2, rate = 0.2), 1)
wind_speed <- round(runif(n, 1, 15), 1)
weather_data <- tibble(
Observation_ID = stri_rand_strings(n, 12),
Date = sample_dates,
Location = sample(c("Jakarta", "Bandung", "Surabaya", "Medan", "Makassar"), n, replace = TRUE),
Season = season,
Temperature = temperature,
Humidity = humidity,
Rainfall = rainfall,
Wind_Speed = wind_speed
)
# Show interactive table with download buttons
datatable(
weather_data,
extensions = 'Buttons',
options = list(
dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel', 'pdf', 'print'),
scrollY = "400px",
scrollCollapse = TRUE,
paging = FALSE
),
caption = htmltools::tags$caption(
style = 'caption-side: top; text-align: left;
font-size: 18px; font-weight: bold;'
),
class = 'stripe hover compact'
)
# Load the dataset
weather_data <- read_excel("6 Data Transformation – Data Science Programming.xlsx", sheet = "Sheet1", skip = 1)
## New names:
## • `` -> `...1`
# Rename columns for clarity
colnames(weather_data) <- c("ID", "Observation_ID", "Date", "Location", "Season", "Temperature", "Humidity", "Rainfall", "Wind_Speed")
# Display first few rows
head(weather_data)
## # A tibble: 6 × 9
## ID Observation_ID Date Location Season Temperature Humidity
## <dbl> <chr> <dttm> <chr> <chr> <dbl> <dbl>
## 1 1 JpeDLWuzIJdl 2021-07-14 00:00:00 Jakarta Dry Se… 30.7 89.5
## 2 2 EF8r6hXBCqfr 2020-11-16 00:00:00 Bandung Rainy … 26.2 70.1
## 3 3 cov0TYDwyQoF 2023-03-22 00:00:00 Makassar Transi… 27.5 80.7
## 4 4 aRB0N7xSEnfa 2023-01-02 00:00:00 Surabaya Rainy … 26.8 90.4
## 5 5 fjf9bvNshjHQ 2023-06-05 00:00:00 Medan Dry Se… 24.9 85.4
## 6 6 jnQGmBA0NZn0 2023-03-15 00:00:00 Jakarta Transi… 31.1 68.2
## # ℹ 2 more variables: Rainfall <dbl>, Wind_Speed <dbl>
grouped_location <- weather_data %>%
group_by(Location) %>%
summarise(
avg_temperature = mean(Temperature, na.rm = TRUE),
total_rainfall = sum(Rainfall, na.rm = TRUE),
observation_count = n()
)
# Display result
grouped_location
## # A tibble: 5 × 4
## Location avg_temperature total_rainfall observation_count
## <chr> <dbl> <dbl> <int>
## 1 Bandung 26.7 976. 101
## 2 Jakarta 27.3 867. 98
## 3 Makassar 26.5 1019. 91
## 4 Medan 27.3 986. 107
## 5 Surabaya 26.8 1056. 103
grouped_season <- weather_data %>%
group_by(Season) %>%
summarise(
avg_temperature = mean(Temperature, na.rm = TRUE),
total_rainfall = sum(Rainfall, na.rm = TRUE),
observation_count = n()
)
# Display result
grouped_season
## # A tibble: 3 × 4
## Season avg_temperature total_rainfall observation_count
## <chr> <dbl> <dbl> <int>
## 1 Dry Season 27.0 1760. 177
## 2 Rainy Season 26.6 1380. 152
## 3 Transitional Season 27.1 1765. 171
ggplot(grouped_location, aes(x = Location, y = avg_temperature, fill = Location)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(title = "Average Temperature by Location", x = "Location", y = "Average Temperature (°C)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplot(grouped_season, aes(x = Season, y = total_rainfall, fill = Season)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(title = "Total Rainfall by Season", x = "Season", y = "Total Rainfall (mm)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
From the grouping by
Location:
From the grouping by
Season:
By grouping the weather data, we can better understand the relationship between locations, seasons, and weather parameters such as temperature and rainfall. This type of transformation is useful for identifying weather patterns and making data-driven decisions related to climate studies or planning events.