Project 2 - Data Transformation

library(tidyverse)
library(knitr)
theme_set(theme_minimal(base_size = 12))
knitr::opts_chunk$set(echo = TRUE)

# Helper: load CSVs or fall back to sample data
load_or_fallback <- function(path, fallback_tbl) {
  if (!file.exists(path)) return(fallback_tbl)
  x <- try(readr::read_csv(path, na = c("", "NA"), show_col_types = FALSE), silent = TRUE)
  if (inherits(x, "try-error") || is.null(x) || ncol(x) <= 1 || nrow(x) == 0) return(fallback_tbl)
  names(x) <- stringr::str_trim(names(x))
  names(x)[1] <- sub("^\ufeff", "", names(x)[1])
  x
}

# Sample fallback data
sales_fallback <- tribble(
  ~Product,   ~Jan, ~Feb, ~Mar, ~Apr, ~May, ~Jun,
  "Widget A", 120, 135, 160, NA, 210, 225,
  "Widget B", 90, 100, 110, 130, 140, NA,
  "Widget C", 60, 75, 85, 95, 110, 120
)

scores_fallback <- tribble(
  ~Student, ~Math, ~Reading, ~Science,
  "Alice", 88, 92, 85,
  "Ben", 76, 81, 79,
  "Cara", 91, 89, 94,
  "Diego", 67, NA, 72
)

vacc_fallback <- tribble(
  ~Region, ~Jan, ~Feb, ~Mar, ~Apr, ~May, ~Jun,
  "North", 10, 25, 45, 60, 70, 80,
  "South", 15, 30, 50, 65, 78, 85,
  "East", 8, 20, 38, 55, 68, 76,
  "West", 12, 28, 44, 58, 72, 82
)

# Load data
sales_wide <- load_or_fallback("sales_wide.csv", sales_fallback)
scores_wide <- load_or_fallback("scores_wide.csv", scores_fallback)
vaccinations_wide <- load_or_fallback("vaccinations_wide.csv", vacc_fallback)

list(
  Sales = head(sales_wide),
  Scores = head(scores_wide),
  Vaccinations = head(vaccinations_wide)
)

## $Sales
## # A tibble: 3 × 7
##   Product    Jan   Feb   Mar   Apr   May   Jun
##   <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Widget A   120   135   160    NA   210   225
## 2 Widget B    90   100   110   130   140    NA
## 3 Widget C    60    75    85    95   110   120
## 
## $Scores
## # A tibble: 4 × 4
##   Student  Math Reading Science
##   <chr>   <dbl>   <dbl>   <dbl>
## 1 Alice      88      92      85
## 2 Ben        76      81      79
## 3 Cara       91      89      94
## 4 Diego      67      NA      72
## 
## $Vaccinations
## # A tibble: 4 × 7
##   Region   Jan   Feb   Mar   Apr   May   Jun
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 North     10    25    45    60    70    80
## 2 South     15    30    50    65    78    85
## 3 East       8    20    38    55    68    76
## 4 West      12    28    44    58    72    82

# Reshape wide -> long using pivot_longer()
sales_long <- sales_wide %>%
  pivot_longer(-Product, names_to = "Month", values_to = "Sales") %>%
  mutate(Sales = replace_na(Sales, 0))

scores_long <- scores_wide %>%
  pivot_longer(-Student, names_to = "Subject", values_to = "Score") %>%
  mutate(Score = replace_na(Score, 0))

vaccinations_long <- vaccinations_wide %>%
  pivot_longer(-Region, names_to = "Month", values_to = "Vaccinated") %>%
  mutate(Vaccinated = replace_na(Vaccinated, 0))

list(
  Sales_long = head(sales_long),
  Scores_long = head(scores_long),
  Vaccinations_long = head(vaccinations_long)
)

## $Sales_long
## # A tibble: 6 × 3
##   Product  Month Sales
##   <chr>    <chr> <dbl>
## 1 Widget A Jan     120
## 2 Widget A Feb     135
## 3 Widget A Mar     160
## 4 Widget A Apr       0
## 5 Widget A May     210
## 6 Widget A Jun     225
## 
## $Scores_long
## # A tibble: 6 × 3
##   Student Subject Score
##   <chr>   <chr>   <dbl>
## 1 Alice   Math       88
## 2 Alice   Reading    92
## 3 Alice   Science    85
## 4 Ben     Math       76
## 5 Ben     Reading    81
## 6 Ben     Science    79
## 
## $Vaccinations_long
## # A tibble: 6 × 3
##   Region Month Vaccinated
##   <chr>  <chr>      <dbl>
## 1 North  Jan           10
## 2 North  Feb           25
## 3 North  Mar           45
## 4 North  Apr           60
## 5 North  May           70
## 6 North  Jun           80

Analysis & Summary

These summaries and simple visuals confirm the reshaped (long) tables are usable for analysis and show a few quick takeaways for each dataset.

# -----------------------------
# 4A) SALES — summary + plot
# -----------------------------
# Order months if present
month_levels <- c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec")
if ("Month" %in% names(sales_long) && all(unique(sales_long$Month) %in% month_levels)) {
  sales_long <- sales_long %>% mutate(Month = factor(Month, levels = month_levels))
}

sales_summary <- sales_long %>%
  group_by(Month) %>%
  summarise(Total_Sales = sum(Sales, na.rm = TRUE), .groups = "drop")

knitr::kable(sales_summary, caption = "Sales: Total by Month", digits = 1)

Sales: Total by Month
Month	Total_Sales
Jan	270
Feb	310
Mar	355
Apr	225
May	460
Jun	345

ggplot(sales_summary, aes(x = Month, y = Total_Sales)) +
  geom_col() +
  labs(title = "Sales – Total by Month",
       x = "Month", y = "Total Sales")

# -----------------------------
# 4B) SCORES — summary + plot
# -----------------------------
scores_summary <- scores_long %>%
  group_by(Subject) %>%
  summarise(Avg_Score = mean(Score, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(Avg_Score))

knitr::kable(scores_summary, caption = "Scores: Average by Subject", digits = 1)

Scores: Average by Subject
Subject	Avg_Score
Science	82.5
Math	80.5
Reading	65.5

ggplot(scores_summary, aes(x = reorder(Subject, Avg_Score), y = Avg_Score)) +
  geom_col() +
  coord_flip() +
  labs(title = "Scores – Average by Subject",
       x = "Subject", y = "Average Score")

# -------------------------------------
# 4C) VACCINATIONS — summary + plot
# -------------------------------------
if ("Month" %in% names(vaccinations_long) && all(unique(vaccinations_long$Month) %in% month_levels)) {
  vaccinations_long <- vaccinations_long %>% mutate(Month = factor(Month, levels = month_levels))
}

vacc_summary <- vaccinations_long %>%
  group_by(Month) %>%
  summarise(Total_Vaccinated = sum(Vaccinated, na.rm = TRUE), .groups = "drop")

knitr::kable(vacc_summary, caption = "Vaccinations: Total by Month", digits = 0)

Vaccinations: Total by Month
Month	Total_Vaccinated
Jan	45
Feb	103
Mar	177
Apr	238
May	288
Jun	323

ggplot(vacc_summary, aes(x = Month, y = Total_Vaccinated, group = 1)) +
  geom_line() +
  geom_point() +
  labs(title = "Vaccinations – Total by Month",
       x = "Month", y = "Total Vaccinated")

# Create a 'outputs' folder if it doesn't exist
if (!dir.exists("outputs")) dir.create("outputs")

# 5A) Write tidy tables
readr::write_csv(sales_long,        "outputs/sales_long.csv")
readr::write_csv(scores_long,       "outputs/scores_long.csv")
readr::write_csv(vaccinations_long, "outputs/vaccinations_long.csv")

# 5B) (Optional) Also write summaries used in Step 4
if (exists("sales_summary")) readr::write_csv(sales_summary, "outputs/sales_summary_by_month.csv")
if (exists("scores_summary")) readr::write_csv(scores_summary, "outputs/scores_avg_by_subject.csv")
if (exists("vacc_summary"))  readr::write_csv(vacc_summary,  "outputs/vaccinations_total_by_month.csv")

# 5C) Show what we saved
list.files("outputs", pattern = "\\.csv$", full.names = TRUE)

## [1] "outputs/sales_long.csv"                 
## [2] "outputs/sales_summary_by_month.csv"     
## [3] "outputs/scores_avg_by_subject.csv"      
## [4] "outputs/scores_long.csv"                
## [5] "outputs/vaccinations_long.csv"          
## [6] "outputs/vaccinations_total_by_month.csv"

# Create README and zip outputs for easy sharing
readme <- c(
  "Project 2 – Outputs",
  "",
  "Tidy datasets created from wide tables:",
  "- sales_long.csv: Product–Month–Sales",
  "- scores_long.csv: Student–Subject–Score",
  "- vaccinations_long.csv: Region–Month–Vaccinated",
  "",
  "Summaries are also included in CSV format."
)
if (!dir.exists("outputs")) dir.create("outputs")
writeLines(readme, "outputs/README.txt")

# Zip all outputs (uses base utils::zip)
utils::zip(zipfile = "project2_outputs.zip", files = list.files("outputs", full.names = TRUE))

# Show what we produced
list(
  Outputs = list.files("outputs", full.names = TRUE),
  Zip = normalizePath("project2_outputs.zip", mustWork = FALSE)
)

## $Outputs
## [1] "outputs/README.txt"                     
## [2] "outputs/sales_long.csv"                 
## [3] "outputs/sales_summary_by_month.csv"     
## [4] "outputs/scores_avg_by_subject.csv"      
## [5] "outputs/scores_long.csv"                
## [6] "outputs/sessionInfo.txt"                
## [7] "outputs/vaccinations_long.csv"          
## [8] "outputs/vaccinations_total_by_month.csv"
## 
## $Zip
## [1] "/Users/kidd/Desktop/CUNY/CUNY_SPS/2025_2_Fall/DATA_607/Projects/Project_2/project2_outputs.zip"

Reflection

This project taught me a lot more than just how to reshape data — it helped me understand the process of problem-solving in R step by step. At first, I was honestly confused about how the CSV files worked and why they weren’t showing up in RStudio, but once I slowed down and followed the structure carefully, it started to make sense. The part that clicked for me was realizing that wide data is just messy information spread across columns, and the goal is to make it tidy so you can actually work with it.

The process of learning pivot_longer() and the fallback setup helped me see how flexible R can be once you understand what it’s doing. I also liked that this project felt like real-world work — the kind of data cleanup that happens before you can analyze or visualize anything. Even though I ran into a few technical issues (like file paths and chunk setup), getting through them made me more confident about how RMarkdown actually runs code behind the scenes.

Overall, I feel like I’m learning how to think more like a data analyst — breaking problems into smaller steps, checking for errors, and not rushing the process. I can honestly say that even when it got frustrating, I learned something from every mistake.

Notes

All code above transforms three wide datasets (Sales, Scores, and Vaccinations)
into tidy formats and summarizes them for analysis.

Outputs are saved to CSVs and are ready for visualization, reporting, or further analysis.

Author: Kevin Martin
Course: DATA 607 – Data Acquisition & Management
Project 2: Data Transformation
Date: 2025-10-05