library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readr)
Read the csv file and view overall structure
#Read the csv file and view overall structure
employment_wide <- read_csv("employment_stats.csv", skip = 1)
## New names:
## Rows: 34 Columns: 5
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 num (4): Aug. 2024...2, Aug. 2025...3, Aug. 2024...4, Aug. 2025...5
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
## • `Aug. 2024` -> `Aug. 2024...2`
## • `Aug. 2025` -> `Aug. 2025...3`
## • `Aug. 2024` -> `Aug. 2024...4`
## • `Aug. 2025` -> `Aug. 2025...5`
head(employment_wide, 10)
## # A tibble: 10 × 5
## ...1 `Aug. 2024...2` `Aug. 2025...3` `Aug. 2024...4` `Aug. 2025...5`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 <NA> NA NA NA NA
## 2 Civilian non… 33649 35129 235207 238872
## 3 Civilian lab… 8030 8809 160733 162226
## 4 Participatio… 23.9 25.1 68.3 67.9
## 5 Employed 7362 8052 153987 155236
## 6 Employment-p… 21.9 22.9 65.5 65
## 7 Unemployed 669 757 6746 6990
## 8 Unemployment… 8.3 8.6 4.2 4.3
## 9 Not in labor… 25619 26321 74474 76646
## 10 Men, 16 to 6… NA NA NA NA
glimpse(employment_wide)
## Rows: 34
## Columns: 5
## $ ...1 <chr> NA, "Civilian noninstitutional population", "Civilian …
## $ `Aug. 2024...2` <dbl> NA, 33649.0, 8030.0, 23.9, 7362.0, 21.9, 669.0, 8.3, 2…
## $ `Aug. 2025...3` <dbl> NA, 35129.0, 8809.0, 25.1, 8052.0, 22.9, 757.0, 8.6, 2…
## $ `Aug. 2024...4` <dbl> NA, 235207.0, 160733.0, 68.3, 153987.0, 65.5, 6746.0, …
## $ `Aug. 2025...5` <dbl> NA, 238872.0, 162226.0, 67.9, 155236.0, 65.0, 6990.0, …
str(employment_wide)
## spc_tbl_ [34 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ...1 : chr [1:34] NA "Civilian noninstitutional population" "Civilian labor force" "Participation rate" ...
## $ Aug. 2024...2: num [1:34] NA 33649 8030 23.9 7362 ...
## $ Aug. 2025...3: num [1:34] NA 35129 8809 25.1 8052 ...
## $ Aug. 2024...4: num [1:34] NA 235207 160733 68.3 153987 ...
## $ Aug. 2025...5: num [1:34] NA 238872 162226 67.9 155236 ...
## - attr(*, "spec")=
## .. cols(
## .. ...1 = col_character(),
## .. `Aug. 2024...2` = col_number(),
## .. `Aug. 2025...3` = col_number(),
## .. `Aug. 2024...4` = col_number(),
## .. `Aug. 2025...5` = col_number()
## .. )
## - attr(*, "problems")=<externalptr>
names(employment_wide)
## [1] "...1" "Aug. 2024...2" "Aug. 2025...3" "Aug. 2024...4"
## [5] "Aug. 2025...5"
tail(employment_wide, 5)
## # A tibble: 5 × 5
## ...1 `Aug. 2024...2` `Aug. 2025...3` `Aug. 2024...4` `Aug. 2025...5`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Employment-po… 7.6 7.8 23.7 22.9
## 2 Unemployed 60 113 330 386
## 3 Unemployment … 4.5 7.4 3.1 3.7
## 4 Not in labor … 15607 16343 32479 33622
## 5 NOTE: A perso… NA NA NA NA
Inspect the dataset
data_types <- sapply(employment_wide, class)
print(data_types)
## ...1 Aug. 2024...2 Aug. 2025...3 Aug. 2024...4 Aug. 2025...5
## "character" "numeric" "numeric" "numeric" "numeric"
numeric_cols <- names(employment_wide)[sapply(employment_wide, is.numeric)]
print(numeric_cols)
## [1] "Aug. 2024...2" "Aug. 2025...3" "Aug. 2024...4" "Aug. 2025...5"
character_cols <- names(employment_wide)[sapply(employment_wide, is.character)]
print(character_cols)
## [1] "...1"
cat("Duplicate rows:", sum(duplicated(employment_wide)))
## Duplicate rows: 0
empty_rows <- employment_wide %>%
filter(if_all(everything(), is.na))
cat("Completely empty rows:", nrow(empty_rows), "\n")
## Completely empty rows: 1
The goal of this transformation is to convert the dataset from wide format to long (tidy) format. In the original wide format, column headers contain data values rather than variable names, violating tidy data principles. The transformed dataset will have five columns: Category, Disability_Status, Year, Month, and Value.
Transformation Approach My approach follows these steps:
# Rename columns in the wide datasetto be more descriptive
employment_wide <- employment_wide %>%
rename(
Category = 1, # First column
Disability_Aug2024 = 2, # People with disability, Aug 2024
Disability_Aug2025 = 3, # People with disability, Aug 2025
NoDisability_Aug2024 = 4, # People with no disability, Aug 2024
NoDisability_Aug2025 = 5 # People with no disability, Aug 2025
) %>%
filter(!is.na(Category)) # Remove empty rows
# View cleaned wide format
print("\nCleaned wide format:")
## [1] "\nCleaned wide format:"
print(head(employment_wide, 10))
## # A tibble: 10 × 5
## Category Disability_Aug2024 Disability_Aug2025 NoDisability_Aug2024
## <chr> <dbl> <dbl> <dbl>
## 1 Civilian noninsti… 33649 35129 235207
## 2 Civilian labor fo… 8030 8809 160733
## 3 Participation rate 23.9 25.1 68.3
## 4 Employed 7362 8052 153987
## 5 Employment-popula… 21.9 22.9 65.5
## 6 Unemployed 669 757 6746
## 7 Unemployment rate 8.3 8.6 4.2
## 8 Not in labor force 25619 26321 74474
## 9 Men, 16 to 64 yea… NA NA NA
## 10 Civilian labor fo… 3377 3837 79333
## # ℹ 1 more variable: NoDisability_Aug2025 <dbl>
Check for empty rows and remove
empty_rows <- employment_wide %>%
filter(if_all(everything(), is.na))
cat("Completely empty rows:", nrow(empty_rows), "\n")
## Completely empty rows: 0
library(tidyverse)
print(head(employment_wide, 15))
## # A tibble: 15 × 5
## Category Disability_Aug2024 Disability_Aug2025 NoDisability_Aug2024
## <chr> <dbl> <dbl> <dbl>
## 1 Civilian noninsti… 33649 35129 235207
## 2 Civilian labor fo… 8030 8809 160733
## 3 Participation rate 23.9 25.1 68.3
## 4 Employed 7362 8052 153987
## 5 Employment-popula… 21.9 22.9 65.5
## 6 Unemployed 669 757 6746
## 7 Unemployment rate 8.3 8.6 4.2
## 8 Not in labor force 25619 26321 74474
## 9 Men, 16 to 64 yea… NA NA NA
## 10 Civilian labor fo… 3377 3837 79333
## 11 Participation rate 41.4 44.3 83
## 12 Employed 3065 3496 76097
## 13 Employment-popula… 37.6 40.4 79.6
## 14 Unemployed 311 341 3237
## 15 Unemployment rate 9.2 8.9 4.1
## # ℹ 1 more variable: NoDisability_Aug2025 <dbl>
# Step 1: Pivot to long
employment_long <- employment_wide %>%
pivot_longer(
cols = -Category,
names_to = "Group_Year",
values_to = "Value"
)
print(head(employment_long, 20))
## # A tibble: 20 × 3
## Category Group_Year Value
## <chr> <chr> <dbl>
## 1 Civilian noninstitutional population Disability_Aug2024 33649
## 2 Civilian noninstitutional population Disability_Aug2025 35129
## 3 Civilian noninstitutional population NoDisability_Aug2024 235207
## 4 Civilian noninstitutional population NoDisability_Aug2025 238872
## 5 Civilian labor force Disability_Aug2024 8030
## 6 Civilian labor force Disability_Aug2025 8809
## 7 Civilian labor force NoDisability_Aug2024 160733
## 8 Civilian labor force NoDisability_Aug2025 162226
## 9 Participation rate Disability_Aug2024 23.9
## 10 Participation rate Disability_Aug2025 25.1
## 11 Participation rate NoDisability_Aug2024 68.3
## 12 Participation rate NoDisability_Aug2025 67.9
## 13 Employed Disability_Aug2024 7362
## 14 Employed Disability_Aug2025 8052
## 15 Employed NoDisability_Aug2024 153987
## 16 Employed NoDisability_Aug2025 155236
## 17 Employment-population ratio Disability_Aug2024 21.9
## 18 Employment-population ratio Disability_Aug2025 22.9
## 19 Employment-population ratio NoDisability_Aug2024 65.5
## 20 Employment-population ratio NoDisability_Aug2025 65
Continue separating and cleaning the data
# Separate and clean
employment_tidy <- employment_long %>%
separate(
Group_Year,
into = c("Disability_Status", "Month_Year"),
sep = "_"
) %>%
mutate(
# Clean disability status
Disability_Status = case_when(
Disability_Status == "Disability" ~ "With Disability",
Disability_Status == "NoDisability" ~ "No Disability",
TRUE ~ Disability_Status
),
# Extract year and month
Year = str_extract(Month_Year, "\\d{4}"),
Month = str_remove(str_extract(Month_Year, "[A-Za-z]+\\.?"), "\\."),
# Convert value to numeric (handle commas)
Value = case_when(
is.character(Value) ~ as.numeric(gsub(",", "", Value)),
TRUE ~ as.numeric(Value)
)
) %>%
select(Category, Disability_Status, Year, Month, Value)
print("=== FINAL TIDY FORMAT ===")
## [1] "=== FINAL TIDY FORMAT ==="
print(head(employment_tidy, 30))
## # A tibble: 30 × 5
## Category Disability_Status Year Month Value
## <chr> <chr> <chr> <chr> <dbl>
## 1 Civilian noninstitutional population With Disability 2024 Aug 33649
## 2 Civilian noninstitutional population With Disability 2025 Aug 35129
## 3 Civilian noninstitutional population No Disability 2024 Aug 235207
## 4 Civilian noninstitutional population No Disability 2025 Aug 238872
## 5 Civilian labor force With Disability 2024 Aug 8030
## 6 Civilian labor force With Disability 2025 Aug 8809
## 7 Civilian labor force No Disability 2024 Aug 160733
## 8 Civilian labor force No Disability 2025 Aug 162226
## 9 Participation rate With Disability 2024 Aug 23.9
## 10 Participation rate With Disability 2025 Aug 25.1
## # ℹ 20 more rows
Check transformation
# Show all unique categories
print(unique(employment_tidy$Category))
## [1] "Civilian noninstitutional population"
## [2] "Civilian labor force"
## [3] "Participation rate"
## [4] "Employed"
## [5] "Employment-population ratio"
## [6] "Unemployed"
## [7] "Unemployment rate"
## [8] "Not in labor force"
## [9] "Men, 16 to 64 years"
## [10] "Women, 16 to 64 years"
## [11] "Both sexes, 65 years and over"
## [12] "NOTE: A person with a disability has at least one of the following conditions: is deaf or has serious difficulty hearing; is blind or has serious difficulty seeing even when wearing glasses; has serious difficulty concentrating, remembering, or making decisions because of a physical, mental, or emotional condition; has serious difficulty walking or climbing stairs; has difficulty dressing or bathing; or has difficulty doing errands alone such as visiting a doctor's office or shopping because of a physical, mental, or emotional condition. Updated population controls are introduced annually with the release of January data."
# Overall employment comparison - people with and without disabilities
overall_employment <- employment_tidy %>%
filter(Category == "Employed") %>%
pivot_wider(names_from = Disability_Status, values_from = Value)
## Warning: Values from `Value` are not uniquely identified; output will contain list-cols.
## • Use `values_fn = list` to suppress this warning.
## • Use `values_fn = {summary_fun}` to summarise duplicates.
## • Use the following dplyr code to identify duplicates.
## {data} |>
## dplyr::summarise(n = dplyr::n(), .by = c(Category, Year, Month,
## Disability_Status)) |>
## dplyr::filter(n > 1L)
print(overall_employment)
## # A tibble: 2 × 5
## Category Year Month `With Disability` `No Disability`
## <chr> <chr> <chr> <list> <list>
## 1 Employed 2024 Aug <dbl [4]> <dbl [4]>
## 2 Employed 2025 Aug <dbl [4]> <dbl [4]>
# Participation rates - percentage of the working-age population that is either employed or actively looking for work.
participation <- employment_tidy %>%
filter(Category == "Participation rate") %>%
pivot_wider(names_from = c(Disability_Status, Year), values_from = Value)
## Warning: Values from `Value` are not uniquely identified; output will contain list-cols.
## • Use `values_fn = list` to suppress this warning.
## • Use `values_fn = {summary_fun}` to summarise duplicates.
## • Use the following dplyr code to identify duplicates.
## {data} |>
## dplyr::summarise(n = dplyr::n(), .by = c(Category, Month, Disability_Status,
## Year)) |>
## dplyr::filter(n > 1L)
print(participation)
## # A tibble: 1 × 6
## Category Month `With Disability_2024` `With Disability_2025`
## <chr> <chr> <list> <list>
## 1 Participation rate Aug <dbl [4]> <dbl [4]>
## # ℹ 2 more variables: `No Disability_2024` <list>, `No Disability_2025` <list>
# Example 3: By demographic group
demographics <- employment_tidy %>%
filter(Category %in% c("Men, 16 to 64 years", "Women, 16 to 64 years",
"Both sexes, 65 years and over"))
print(head(demographics, 12))
## # A tibble: 12 × 5
## Category Disability_Status Year Month Value
## <chr> <chr> <chr> <chr> <dbl>
## 1 Men, 16 to 64 years With Disability 2024 Aug NA
## 2 Men, 16 to 64 years With Disability 2025 Aug NA
## 3 Men, 16 to 64 years No Disability 2024 Aug NA
## 4 Men, 16 to 64 years No Disability 2025 Aug NA
## 5 Women, 16 to 64 years With Disability 2024 Aug NA
## 6 Women, 16 to 64 years With Disability 2025 Aug NA
## 7 Women, 16 to 64 years No Disability 2024 Aug NA
## 8 Women, 16 to 64 years No Disability 2025 Aug NA
## 9 Both sexes, 65 years and over With Disability 2024 Aug NA
## 10 Both sexes, 65 years and over With Disability 2025 Aug NA
## 11 Both sexes, 65 years and over No Disability 2024 Aug NA
## 12 Both sexes, 65 years and over No Disability 2025 Aug NA
Save tidy format in a csv
write_csv(employment_tidy, "employment_status_tidy.csv")
cat("\nTidy data saved to: employment_status_tidy.csv\n")
##
## Tidy data saved to: employment_status_tidy.csv
# Load the tidy data
employment_tidy <- read_csv("employment_status_tidy.csv")
## Rows: 132 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Category, Disability_Status, Month
## dbl (2): Year, Value
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Filter 2025 data for people with disabilities
employment_tidy %>%
filter(Year == "2025", Disability_Status == "With Disability")
## # A tibble: 33 × 5
## Category Disability_Status Year Month Value
## <chr> <chr> <dbl> <chr> <dbl>
## 1 Civilian noninstitutional population With Disability 2025 Aug 35129
## 2 Civilian labor force With Disability 2025 Aug 8809
## 3 Participation rate With Disability 2025 Aug 25.1
## 4 Employed With Disability 2025 Aug 8052
## 5 Employment-population ratio With Disability 2025 Aug 22.9
## 6 Unemployed With Disability 2025 Aug 757
## 7 Unemployment rate With Disability 2025 Aug 8.6
## 8 Not in labor force With Disability 2025 Aug 26321
## 9 Men, 16 to 64 years With Disability 2025 Aug NA
## 10 Civilian labor force With Disability 2025 Aug 3837
## # ℹ 23 more rows
# Example: Calculate participation rate changes
employment_tidy %>%
filter(Category == "Participation rate") %>%
group_by(Disability_Status) %>%
arrange(Year) %>%
mutate(Change = Value - lag(Value))
## # A tibble: 16 × 6
## # Groups: Disability_Status [2]
## Category Disability_Status Year Month Value Change
## <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 Participation rate With Disability 2024 Aug 23.9 NA
## 2 Participation rate No Disability 2024 Aug 68.3 NA
## 3 Participation rate With Disability 2024 Aug 41.4 17.5
## 4 Participation rate No Disability 2024 Aug 83 14.7
## 5 Participation rate With Disability 2024 Aug 38.7 -2.70
## 6 Participation rate No Disability 2024 Aug 73.3 -9.7
## 7 Participation rate With Disability 2024 Aug 7.9 -30.8
## 8 Participation rate No Disability 2024 Aug 24.5 -48.8
## 9 Participation rate With Disability 2025 Aug 25.1 17.2
## 10 Participation rate No Disability 2025 Aug 67.9 43.4
## 11 Participation rate With Disability 2025 Aug 44.3 19.2
## 12 Participation rate No Disability 2025 Aug 83.2 15.3
## 13 Participation rate With Disability 2025 Aug 40.1 -4.20
## 14 Participation rate No Disability 2025 Aug 72.6 -10.6
## 15 Participation rate With Disability 2025 Aug 8.5 -31.6
## 16 Participation rate No Disability 2025 Aug 23.8 -48.8
library(tidyverse)
# Read the Airbnb dataset
df_airbnb <- read_csv('AB_NYC_2019.csv')
## Rows: 48895 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): name, host_name, neighbourhood_group, neighbourhood, room_type, la...
## dbl (10): id, host_id, latitude, longitude, price, minimum_nights, number_of...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cat("Original Airbnb Dataset Shape:", dim(df_airbnb)[1], "rows x", dim(df_airbnb)[2], "columns\n")
## Original Airbnb Dataset Shape: 48895 rows x 16 columns
cat("\nFirst 3 rows of original data:\n")
##
## First 3 rows of original data:
print(head(df_airbnb, 3))
## # A tibble: 3 × 16
## id name host_id host_name neighbourhood_group neighbourhood latitude
## <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 2539 Clean & qu… 2787 John Brooklyn Kensington 40.6
## 2 2595 Skylit Mid… 2845 Jennifer Manhattan Midtown 40.8
## 3 3647 THE VILLAG… 4632 Elisabeth Manhattan Harlem 40.8
## # ℹ 9 more variables: longitude <dbl>, room_type <chr>, price <dbl>,
## # minimum_nights <dbl>, number_of_reviews <dbl>, last_review <chr>,
## # reviews_per_month <dbl>, calculated_host_listings_count <dbl>,
## # availability_365 <dbl>
# Create long format using pivot_longer()
# Keep identifier columns, pivot measurement columns
airbnb_long <- df_airbnb %>%
pivot_longer(
cols = c(price, minimum_nights, number_of_reviews,
reviews_per_month, calculated_host_listings_count, availability_365),
names_to = "metric_type",
values_to = "metric_value"
)
# Clean and tidy the data
airbnb_long_tidy <- airbnb_long %>%
drop_na(metric_value) %>% # Remove null values
arrange(id, metric_type) # Sort by id and metric_type
print(head(airbnb_long_tidy, 12))
## # A tibble: 12 × 12
## id name host_id host_name neighbourhood_group neighbourhood latitude
## <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 2539 Clean & q… 2787 John Brooklyn Kensington 40.6
## 2 2539 Clean & q… 2787 John Brooklyn Kensington 40.6
## 3 2539 Clean & q… 2787 John Brooklyn Kensington 40.6
## 4 2539 Clean & q… 2787 John Brooklyn Kensington 40.6
## 5 2539 Clean & q… 2787 John Brooklyn Kensington 40.6
## 6 2539 Clean & q… 2787 John Brooklyn Kensington 40.6
## 7 2595 Skylit Mi… 2845 Jennifer Manhattan Midtown 40.8
## 8 2595 Skylit Mi… 2845 Jennifer Manhattan Midtown 40.8
## 9 2595 Skylit Mi… 2845 Jennifer Manhattan Midtown 40.8
## 10 2595 Skylit Mi… 2845 Jennifer Manhattan Midtown 40.8
## 11 2595 Skylit Mi… 2845 Jennifer Manhattan Midtown 40.8
## 12 2595 Skylit Mi… 2845 Jennifer Manhattan Midtown 40.8
## # ℹ 5 more variables: longitude <dbl>, room_type <chr>, last_review <chr>,
## # metric_type <chr>, metric_value <dbl>
# Save tidied long format
write_csv(airbnb_long_tidy, 'airbnb_long_tidy.csv')
cat("1. AIRBNB DATASET:\n")
## 1. AIRBNB DATASET:
cat(" Wide: ", nrow(df_airbnb), "rows ×", ncol(df_airbnb), "columns\n")
## Wide: 48895 rows × 16 columns
cat(" Long: ", nrow(airbnb_long_tidy), "rows ×", ncol(airbnb_long_tidy), "columns\n")
## Long: 283318 rows × 12 columns
cat(" Output: airbnb_long_tidy.csv\n\n")
## Output: airbnb_long_tidy.csv
The Airbnb dataset originally had 48,895 rows (one per listing) with
measurements spread across 16 columns. Using
pivot_longer()
, we transformed 6 measurement columns
(price, minimum_nights, number_of_reviews, etc.) into long format by
creating two new columns: “metric_type” (what we’re measuring) and
“metric_value” (the actual number). Each listing that was 1 row became 6
rows - one per metric - with identifier information repeated across
rows. The dataset grew from ~49,000 to ~290,000 rows, creating a
cleaner, more analyzable structure where each row represents a single
measurement.
# Read the insurance dataset
df_insurance <- read_csv('insurance.csv')
## Rows: 1338 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): sex, smoker, region
## dbl (4): age, bmi, children, charges
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
print(head(df_insurance, 10))
## # A tibble: 10 × 7
## age sex bmi children smoker region charges
## <dbl> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 19 female 27.9 0 yes southwest 16885.
## 2 18 male 33.8 1 no southeast 1726.
## 3 28 male 33 3 no southeast 4449.
## 4 33 male 22.7 0 no northwest 21984.
## 5 32 male 28.9 0 no northwest 3867.
## 6 31 female 25.7 0 no southeast 3757.
## 7 46 female 33.4 1 no southeast 8241.
## 8 37 female 27.7 3 no northwest 7282.
## 9 37 male 29.8 2 no northeast 6406.
## 10 60 female 25.8 0 no northwest 28923.
cat("Original Shape:", nrow(df_insurance), "rows x", ncol(df_insurance), "columns\n")
## Original Shape: 1338 rows x 7 columns
# Create long format using pivot_longer()
insurance_long_tidy <- df_insurance %>%
pivot_longer(
cols = c(age, bmi, children, charges), # Numeric measurements
names_to = "measurement_type",
values_to = "measurement_value"
) %>%
drop_na(measurement_value) %>% # Remove nulls
arrange(sex, region, measurement_type) # Sort logically
cat("Tidy Long Format:", nrow(insurance_long_tidy), "rows x", ncol(insurance_long_tidy), "columns\n")
## Tidy Long Format: 5352 rows x 5 columns
print(head(insurance_long_tidy, 16))
## # A tibble: 16 × 5
## sex smoker region measurement_type measurement_value
## <chr> <chr> <chr> <chr> <dbl>
## 1 female no northeast age 52
## 2 female no northeast age 60
## 3 female yes northeast age 34
## 4 female no northeast age 63
## 5 female no northeast age 18
## 6 female no northeast age 24
## 7 female no northeast age 18
## 8 female no northeast age 18
## 9 female no northeast age 58
## 10 female no northeast age 22
## 11 female no northeast age 45
## 12 female no northeast age 18
## 13 female no northeast age 26
## 14 female no northeast age 59
## 15 female no northeast age 61
## 16 female no northeast age 20
The insurance dataset originally had 1,338 rows (one per person) with
7 columns including demographic information (sex, smoker status, region)
and 4 numeric measurements (age, bmi, children, charges). Using
pivot_longer()
, we transformed these 4 measurement columns
into long format by creating two new columns: “measurement_type” (what
we’re measuring) and “measurement_value” (the actual number). Each
person that was 1 row became 4 rows - one per measurement - with
demographic identifiers (sex, smoker, region) repeated across rows. The
dataset grew from 1,338 to approximately 5,352 rows, creating a tidy
structure where each row represents a single measurement for a single
person, making it easier to filter, visualize, and analyze specific
measurements across different demographic groups.