library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Project 2 - Dataset 1

Tidying

Load the Dataset

# Read the raw CSV directly from the GitHub URL into `data1`.
data1 <- read.csv(
  file = "https://raw.githubusercontent.com/mraynolds/data_607/refs/heads/main/data_607_project_2_dataset_1.csv"
)

# Inspect the column names and types before any reshaping.
glimpse(x = data1)
## Rows: 3
## Columns: 7
## $ City      <chr> "New York", "Los Angeles", "Chicago"
## $ Temp_Jan  <chr> "32°F", "58°F", "28°F"
## $ Temp_Feb  <chr> "35°F", "60°F", "30°F"
## $ Temp_Mar  <chr> "42°F", "65°F", "40°F"
## $ Humid_Jan <chr> "75%", "65%", "80%"
## $ Humid_Feb <chr> "72%", "63%", "78%"
## $ Humid_Mar <chr> "68%", "60%", "75%"

Pivot data longer

The dataset has one primary issue keeping it from being tidy: each column header combines two variables, an environmental measurement (temperature or humidity) and a month.

The following code pivots the data longer while separating the column names into their separate components.

# Reshape to long form. Every column except City is split at the underscore:
# the first piece becomes `environment`, the second becomes `month`, and the
# cell contents are collected in `measurement`.
data1 <- pivot_longer(
  data1,
  cols = !City,
  names_to = c("environment", "month"),
  names_sep = "_",
  values_to = "measurement"
)

# Check the reshaped structure.
glimpse(x = data1)
## Rows: 18
## Columns: 4
## $ City        <chr> "New York", "New York", "New York", "New York", "New York"…
## $ environment <chr> "Temp", "Temp", "Temp", "Humid", "Humid", "Humid", "Temp",…
## $ month       <chr> "Jan", "Feb", "Mar", "Jan", "Feb", "Mar", "Jan", "Feb", "M…
## $ measurement <chr> "32°F", "35°F", "42°F", "75%", "72%", "68%", "58°F", "60°F…

Define the level order for the month factor

# Calendar-ordered three-letter month abbreviations ("Jan" ... "Dec"), used
# later as factor levels so months sort chronologically, not alphabetically.
# `month.abb` is base R's built-in constant holding exactly these values, so
# there is no need to type them out by hand.
month_levels <- month.abb

Clean up the data

The data now could use some cleaning.

The code block below does the following:

  • renames “Humid” to “humidity_pct” to indicate that the measurement is “percent humidity”
  • renames “Temp” to “temp_f” to indicate that the measurement is “Temperature in Fahrenheit”
  • removes all unit measurements from the measurement column (degrees Fahrenheit and percent symbol) so that the digits can be used as numbers
  • converts the measurement column to numeric
# Clean the long table:
#   * lower-case the city column name,
#   * relabel the measurement types with unit-bearing names,
#   * strip the unit symbols from the readings and store them as numbers,
#   * convert month to a factor whose levels follow `month_levels`.
data_long_1 <- data1 |>
  rename(city = City) |>
  mutate(
    environment = environment |>
      str_replace_all("Humid", "humidity_pct") |>
      str_replace_all("Temp", "temp_f"),
    measurement = as.numeric(parse_number(measurement)),
    month = factor(month, levels = month_levels)
  )

# Confirm the new names and column types.
glimpse(x = data_long_1)
## Rows: 18
## Columns: 4
## $ city        <chr> "New York", "New York", "New York", "New York", "New York"…
## $ environment <chr> "temp_f", "temp_f", "temp_f", "humidity_pct", "humidity_pc…
## $ month       <fct> Jan, Feb, Mar, Jan, Feb, Mar, Jan, Feb, Mar, Jan, Feb, Mar…
## $ measurement <dbl> 32, 35, 42, 75, 72, 68, 58, 60, 65, 65, 63, 60, 28, 30, 40…

Pivot Wider

The data in the previous data frame, “data_long_1”, is still not tidy as there are two types of data in the measurement column, humidity and temperature.

The following code block pivots the data wider so that temperature and humidity are separated into their own columns.

# Spread the measurement types back out so that each value of `environment`
# becomes its own column, filled from `measurement`.
data_tidy_1 <- pivot_wider(
  data_long_1,
  names_from = environment,
  values_from = measurement
)

# Check the final tidy structure.
glimpse(x = data_tidy_1)
## Rows: 9
## Columns: 4
## $ city         <chr> "New York", "New York", "New York", "Los Angeles", "Los A…
## $ month        <fct> Jan, Feb, Mar, Jan, Feb, Mar, Jan, Feb, Mar
## $ temp_f       <dbl> 32, 35, 42, 58, 60, 65, 28, 30, 40
## $ humidity_pct <dbl> 75, 72, 68, 65, 63, 60, 80, 78, 75

The data is now tidy and ready for analysis.

Analysis

Averages

The code block below calculates the average temperature and humidity for the dataset, first by month, and then by city.

# Mean temperature and humidity per month, rounded to one decimal place.
# across() applies the same summary to both columns and prefixes each result
# column with "avg_".
data_tidy_1 |>
  group_by(month) |>
  summarise(
    across(
      c(temp_f, humidity_pct),
      \(x) round(mean(x), 1),
      .names = "avg_{.col}"
    )
  )
## # A tibble: 3 × 3
##   month avg_temp_f avg_humidity_pct
##   <fct>      <dbl>            <dbl>
## 1 Jan         39.3             73.3
## 2 Feb         41.7             71  
## 3 Mar         49               67.7
# Mean temperature and humidity per city, rounded to one decimal place.
# across() applies the same summary to both columns and prefixes each result
# column with "avg_".
data_tidy_1 |>
  group_by(city) |>
  summarise(
    across(
      c(temp_f, humidity_pct),
      \(x) round(mean(x), 1),
      .names = "avg_{.col}"
    )
  )
## # A tibble: 3 × 3
##   city        avg_temp_f avg_humidity_pct
##   <chr>            <dbl>            <dbl>
## 1 Chicago           32.7             77.7
## 2 Los Angeles       61               62.7
## 3 New York          36.3             71.7

Plot

A plot of temperature over time.

# Line chart of temperature by month, one coloured line per city.
data_tidy_1 |>
  ggplot(
    mapping = aes(x = month, y = temp_f, group = city, color = city)
  ) +
  geom_line()

A plot of humidity over time.

# Line chart of percent humidity by month, one coloured line per city.
ggplot(data = data_tidy_1) +
  geom_line(
    mapping = aes(x = month, y = humidity_pct, group = city, color = city)
  )

# Plot of temp and humidity together

# Overlay humidity and temperature for every city on one chart.
# Colour identifies the city and linetype identifies the measure; without the
# linetype mapping both layers are drawn identically and each city's two lines
# cannot be told apart.
# The secondary axis uses the identity transform (~ .), so it only adds a
# second axis title and does not rescale the temperature values.
ggplot(data_tidy_1, aes(x = month, group = city, color = city)) +
  geom_line(aes(y = humidity_pct, linetype = "Percent Humidity")) +
  geom_line(aes(y = temp_f, linetype = "Temperature in Fahrenheit")) +
  scale_y_continuous(
    name = "Percent Humidity",
    sec.axis = sec_axis(~ ., name = "Temperature in Fahrenheit")
  ) +
  labs(linetype = "measure")