Project 2

Tidying Datasets

Import Packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)

Untidy Data 1

Travel Expense Report Posted by: Escarlet Gabriel Vicente Data Analysis Suggestions: “If tidied, the data could be used to analyze average expenses by city, compare spending across categories, or track changes in daily travel costs.” Tidy Data

# Import untidy data
expense_report <- read.csv("Travel Expense Report.csv")

# Remove unnecessary rows and columns
expense_report <- expense_report %>%
  filter(!(X %in% c("Totals", "subtotals"))) %>% #remove rows
  select(-"subtotals") %>% #remove columns
  rename(Dates = X) %>% # rename X column
  mutate(Location=NA) # Add extra column for Location

# Fill Location column
expense_report <- expense_report %>%
  mutate(
    # Check if the Dates column looks like a date or not
    is_location = is.na(dmy(Dates)),
    
    # If it's a location, store it; else NA
    Location = ifelse(is_location, Dates, NA)
  ) %>%
  fill(Location) %>%
  filter(!is_location) %>% # remove location rows
  select(-is_location) # remove helper column

# Convert dates to Dates
expense_report <- expense_report %>%
  mutate(Dates = as.Date(expense_report$Dates, "%d-%b-%y"))

Perform Data Analysis

# 1. Average spending per location by category
expense_report %>%
  group_by(Location) %>%
  summarise(
    avg_meals = mean(Meals),
    avg_hotels = mean(Hotels),
    avg_transport = mean(Transport)
  )
## # A tibble: 2 × 4
##   Location avg_meals avg_hotels avg_transport
##   <chr>        <dbl>      <dbl>         <dbl>
## 1 San Jose      32.5        112            45
## 2 Seattle       65.6        109            36
# 2. Total spending per location by category
expense_report %>%
  group_by(Location) %>%
  summarise(
    total_Meals = sum(Meals),
    total_Hotels = sum(Hotels),
    total_Transport = sum(Transport)
  )
## # A tibble: 2 × 4
##   Location total_Meals total_Hotels total_Transport
##   <chr>          <dbl>        <int>           <int>
## 1 San Jose        65.0          224              90
## 2 Seattle        131.           218              72

Untidy Data 2

US Labor Statistics Posted by: Jacob Shapiro Data Analysis Suggestions: “An analysis that could be done is which county in each state has the highest difference between”Civilian labor force” and “Unemployed”. Could also use a graph to show percent of unemployment per state.”

# Import US Labor Statistics CSV, skip headers
us_labor <- read.csv("US Labor Statistics.csv", skip=3)

# Remove last 4 columns which are percentages
us_labor <- us_labor %>%
  select(-c("X2024.2", "X2025.1", "X2024.3", "X2025.p..1"))


# Temporary column names
colnames(us_labor) <- c("City", "Civilian Labor Force_July 2024", 
                        "Civilian Labor Force_July 2025", 
                        "Civilian Labor Force_August 2024", 
                        "Civilian Labor Force_August 2025", 
                        "Unemployed_July 2024",
                        "Unemployed_July 2025", 
                        "Unemployed_August 2024",
                        "Unemployed_August 2025")

# Reshape to long
us_labor <- us_labor %>%
  pivot_longer(
    cols = -"City",
    names_to = "Category_Month_Year",
    values_to = "Value"
  )

# Extract information from "Category_Month_Year" column
us_labor <- us_labor %>%
  separate(Category_Month_Year,
           into = c("Category", "Month_Year"),
           sep = "_", extra = "merge", fill = "right")

Separate State and City columns

us_labor$State <- NA

us_labor <- us_labor %>%
  mutate(
    # Check if the States column is actually state
    is_state = City %in% state.name,
    
    # If yes, store it; else NA
    State = ifelse(is_state, City, NA)
  ) %>%
  fill(State) %>%
  filter(!is_state) %>% # remove location rows
  select(-is_state) # remove helper column


# Make a little wider and clean Month_Year
us_labor <- us_labor %>%
  pivot_wider(
    names_from = Category,
    values_from = Value
  ) %>% 
  select(State, City, everything()) # reorder columns

Data Analysis

# Making plot
ggplot(us_labor, aes(x = Month_Year, y = Unemployed, color = City, group = City)) +
  geom_line() +
  labs(
    title = "Unemployment Trend by City",
    x = "Month",
    y = "Unemployed"
  )

#Making plot for one month
us_labor %>%
  filter(Month_Year == "July 2024") %>%
  ggplot(aes(x = reorder(City, Unemployed), y = Unemployed, fill = State)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Unemployed by City (August 2025)",
    x = "City",
    y = "Unemployed"
  )

Untidy Data 3

Country Data Posted by: Taha Malik

Data Analysis Suggestions: “Compare GDP growth across countries over time. Analyze population growth rates and their relationship with GDP changes. Create visualizations such as line charts showing GDP per capita trends from 2000 to 2010.”

country_data <- read.csv("Country Data.csv")

colnames(country_data) <- c("Country", "2000 Population", "2000 GDP",
                            "2005 Population", "2005 GDP",
                            "2010 Population", "2010 GDP")

# Reshape to long
country_data <- country_data %>%
  pivot_longer(
    cols = -"Country",
    names_to = "Year_Category",
    values_to = "Value"
  )

# Extract information from "Year_Category" column
country_data <- country_data %>%
  separate(Year_Category,
           into = c("Year", "Category"),
           sep = " ", extra = "merge", fill = "right")

# Make a little wider
country_data <- country_data %>%
  pivot_wider(
    names_from = Category,
    values_from = Value
  )

Data Analysis

ggplot(country_data, aes(x = Year, y = GDP, color = Country, group = Country)) + 
  geom_line(size=2) +
  labs(
    title = "GDP by Country",
    x = "Year",
    y = "GDP"
  )