library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the raw city weather table (one row per city, one column per
# measurement/month combination) and print a compact overview of it.
data1 <- read.csv(
  file = "https://raw.githubusercontent.com/mraynolds/data_607/refs/heads/main/data_607_project_2_dataset_1.csv"
)
data1 |> glimpse()
## Rows: 3
## Columns: 7
## $ City <chr> "New York", "Los Angeles", "Chicago"
## $ Temp_Jan <chr> "32°F", "58°F", "28°F"
## $ Temp_Feb <chr> "35°F", "60°F", "30°F"
## $ Temp_Mar <chr> "42°F", "65°F", "40°F"
## $ Humid_Jan <chr> "75%", "65%", "80%"
## $ Humid_Feb <chr> "72%", "63%", "78%"
## $ Humid_Mar <chr> "68%", "60%", "75%"
The dataset has one primary issue keeping it from being tidy: each column header combines two variables, an environmental measurement (temperature or humidity) and a month.
The following code pivots the data longer while separating the column names into their separate components.
# Reshape to long form: every column except City is split at the underscore
# into a measurement type ("environment") and a "month", and the cell values
# are gathered into "measurement".
data1 <- pivot_longer(
  data1,
  cols = -City,
  names_sep = "_",
  names_to = c("environment", "month"),
  values_to = "measurement"
)
data1 |> glimpse()
## Rows: 18
## Columns: 4
## $ City <chr> "New York", "New York", "New York", "New York", "New York"…
## $ environment <chr> "Temp", "Temp", "Temp", "Humid", "Humid", "Humid", "Temp",…
## $ month <chr> "Jan", "Feb", "Mar", "Jan", "Feb", "Mar", "Jan", "Feb", "M…
## $ measurement <chr> "32°F", "35°F", "42°F", "75%", "72%", "68%", "58°F", "60°F…
# Calendar-ordered month abbreviations ("Jan" ... "Dec"), used as factor
# levels so that months sort chronologically rather than alphabetically.
# `month.abb` is base R's built-in constant holding exactly these twelve
# English abbreviations, so there is no need to type them out by hand.
month_levels <- month.abb
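The data now could use some cleaning.
The code block below does the following:
- Renames the `environment` values to descriptive names that include their units (`temp_f`, `humidity_pct`).
- Extracts the numeric value from each `measurement` string, dropping the unit symbols.
- Converts `month` to a factor whose levels follow calendar order.
- Renames the `City` column to lowercase `city`.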
# Clean the long table:
#   * give the measurement types descriptive names that carry their units,
#   * pull the numeric value out of each measurement string,
#   * turn month into a factor ordered by calendar month,
#   * rename City to lowercase for consistency with the other columns.
data_long_1 <- data1 |>
  mutate(
    # A named vector applies both replacements, in order, in a single call.
    environment = str_replace_all(
      environment,
      c("Humid" = "humidity_pct", "Temp" = "temp_f")
    ),
    # parse_number() already returns a double, so no extra coercion is needed.
    measurement = parse_number(measurement),
    month = factor(month, levels = month_levels)
  ) |>
  rename(city = City)
glimpse(data_long_1)
## Rows: 18
## Columns: 4
## $ city <chr> "New York", "New York", "New York", "New York", "New York"…
## $ environment <chr> "temp_f", "temp_f", "temp_f", "humidity_pct", "humidity_pc…
## $ month <fct> Jan, Feb, Mar, Jan, Feb, Mar, Jan, Feb, Mar, Jan, Feb, Mar…
## $ measurement <dbl> 32, 35, 42, 75, 72, 68, 58, 60, 65, 65, 63, 60, 28, 30, 40…
The data in the previous data frame, “data_long_1”, is still not tidy as there are two types of data in the measurement column, humidity and temperature.
The following code block pivots the data wider so that temperature and humidity are separated into their own columns.
# Spread the measurement types into their own columns: the values of
# `environment` become column names and `measurement` supplies the cells.
data_tidy_1 <- pivot_wider(
  data_long_1,
  values_from = measurement,
  names_from = environment
)
data_tidy_1 |> glimpse()
## Rows: 9
## Columns: 4
## $ city <chr> "New York", "New York", "New York", "Los Angeles", "Los A…
## $ month <fct> Jan, Feb, Mar, Jan, Feb, Mar, Jan, Feb, Mar
## $ temp_f <dbl> 32, 35, 42, 58, 60, 65, 28, 30, 40
## $ humidity_pct <dbl> 75, 72, 68, 65, 63, 60, 80, 78, 75
The data is now tidy and ready for analysis.
The code block below calculates the average temperature and humidity for the dataset, first by month, and then by city.
# Mean temperature and humidity per month, rounded to one decimal place.
# across() applies the same summary to both columns and prefixes the
# resulting names with "avg_".
data_tidy_1 |>
  group_by(month) |>
  summarise(
    across(
      c(temp_f, humidity_pct),
      \(values) round(mean(values), 1),
      .names = "avg_{.col}"
    )
  )
## # A tibble: 3 × 3
## month avg_temp_f avg_humidity_pct
## <fct> <dbl> <dbl>
## 1 Jan 39.3 73.3
## 2 Feb 41.7 71
## 3 Mar 49 67.7
# Mean temperature and humidity per city, rounded to one decimal place.
# across() applies the same summary to both columns and prefixes the
# resulting names with "avg_".
data_tidy_1 |>
  group_by(city) |>
  summarise(
    across(
      c(temp_f, humidity_pct),
      \(values) round(mean(values), 1),
      .names = "avg_{.col}"
    )
  )
## # A tibble: 3 × 3
## city avg_temp_f avg_humidity_pct
## <chr> <dbl> <dbl>
## 1 Chicago 32.7 77.7
## 2 Los Angeles 61 62.7
## 3 New York 36.3 71.7
A plot of temperature over time.
# Line chart of temperature by month, one coloured line per city.
ggplot(data = data_tidy_1) +
  geom_line(
    mapping = aes(x = month, y = temp_f, colour = city, group = city)
  )
A plot of humidity over time.
# Line chart of humidity by month, one coloured line per city.
ggplot(data = data_tidy_1) +
  geom_line(
    mapping = aes(x = month, y = humidity_pct, colour = city, group = city)
  )
# Plot of temp and humidity together.
# Both measures are coloured by city, so line type is mapped as well; without
# it a city's humidity line and temperature line are drawn identically and
# cannot be told apart.
ggplot(data_tidy_1, aes(x = month, group = city, color = city)) +
  geom_line(aes(y = humidity_pct, linetype = "Humidity (%)")) +
  geom_line(aes(y = temp_f, linetype = "Temperature (F)")) +
  scale_y_continuous(
    name = "Percent Humidity",
    # Identity transform: the secondary axis repeats the same numeric scale
    # under the temperature label.
    sec.axis = sec_axis(~., name = "Temperature in Fahrenheit")
  ) +
  labs(linetype = "Measure")