# Load Required Libraries
library(tidyverse) # Includes dplyr and tidyr
## Warning: package 'dplyr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr) # For reading CSV files
# Read the CSV file
weather_data <- read_csv("Weather_Data.csv")
## Rows: 3 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): City, Temp_Jan, Temp_Feb, Temp_Mar, Humid_Jan, Humid_Feb, Humid_Mar
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Print the CSV file
print(weather_data)
## # A tibble: 3 × 7
## City Temp_Jan Temp_Feb Temp_Mar Humid_Jan Humid_Feb Humid_Mar
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 New York 32°F 35°F 42°F 75% 72% 68%
## 2 Los Angeles 58°F 60°F 65°F 65% 63% 60%
## 3 Chicago 28°F 30°F 40°F 80% 78% 75%
# Convert to long format using pivot_longer
weather_tidy <- weather_data %>%
pivot_longer(
cols = -City,
names_to = c("Measure", "Month"),
names_sep = "_",
values_to = "Value"
) %>%
mutate(
Value = as.numeric(str_replace(Value, "°F|%", "")), # Convert to numeric
Month = factor(Month, levels = c("Jan", "Feb", "Mar")) # Order months
)
# Display cleaned data
head(weather_tidy)
## # A tibble: 6 × 4
## City Measure Month Value
## <chr> <chr> <fct> <dbl>
## 1 New York Temp Jan 32
## 2 New York Temp Feb 35
## 3 New York Temp Mar 42
## 4 New York Humid Jan 75
## 5 New York Humid Feb 72
## 6 New York Humid Mar 68
print(weather_tidy)
## # A tibble: 18 × 4
## City Measure Month Value
## <chr> <chr> <fct> <dbl>
## 1 New York Temp Jan 32
## 2 New York Temp Feb 35
## 3 New York Temp Mar 42
## 4 New York Humid Jan 75
## 5 New York Humid Feb 72
## 6 New York Humid Mar 68
## 7 Los Angeles Temp Jan 58
## 8 Los Angeles Temp Feb 60
## 9 Los Angeles Temp Mar 65
## 10 Los Angeles Humid Jan 65
## 11 Los Angeles Humid Feb 63
## 12 Los Angeles Humid Mar 60
## 13 Chicago Temp Jan 28
## 14 Chicago Temp Feb 30
## 15 Chicago Temp Mar 40
## 16 Chicago Humid Jan 80
## 17 Chicago Humid Feb 78
## 18 Chicago Humid Mar 75
#transforms the long-format data back into a wide format using pivot wider
weather_clean <- weather_tidy %>%
pivot_wider(names_from = Measure, values_from = Value)
# Display cleaned data
head(weather_clean)
## # A tibble: 6 × 4
## City Month Temp Humid
## <chr> <fct> <dbl> <dbl>
## 1 New York Jan 32 75
## 2 New York Feb 35 72
## 3 New York Mar 42 68
## 4 Los Angeles Jan 58 65
## 5 Los Angeles Feb 60 63
## 6 Los Angeles Mar 65 60
# Temperature Trends
ggplot(weather_clean, aes(x = Month, y = Temp, group = City, color = City)) +
geom_line(size = 1) +
geom_point(size = 3) +
labs(title = "Temperature Trends (Jan-Mar)", y = "Temperature (°F)") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Humidity Trends
ggplot(weather_clean, aes(x = Month, y = Humid, group = City, color = City)) +
geom_line(size = 1) +
geom_point(size = 3) +
labs(title = "Humidity Trends (Jan-Mar)", y = "Humidity (%)") +
theme_minimal()

# Calculate mean temperature and humidity per city
city_summary <- weather_clean %>%
group_by(City) %>%
summarise(
Avg_Temp = mean(Temp, na.rm = TRUE),
Avg_Humid = mean(Humid, na.rm = TRUE)
)
# Display summary
print(city_summary)
## # A tibble: 3 × 3
## City Avg_Temp Avg_Humid
## <chr> <dbl> <dbl>
## 1 Chicago 32.7 77.7
## 2 Los Angeles 61 62.7
## 3 New York 36.3 71.7
#as we can see Los Angeles is a much warmer city than NY or Chicago. Interestingly, Chicago is a more humid city in the Jan-Mar months, followed by NY and then LA.
## Dataset 2: Student Test Scores
library(dplyr)
library(tidyr)
student_scores <- read.csv("Student_Scores.csv")
# View original data
head(student_scores)
## Student Math_2021 Math_2022 Reading_2021 Reading_2022
## 1 Alice 88 90 85 88
## 2 Bob 92 89 90 91
## 3 Charlie 79 81 78 80
## 4 Diana 95 94 92 93
## 5 Ethan 85 87 84 86
### Tidying Student Scores
# Convert wide format to long format by separating subject and year into two columns
tidy_scores <- student_scores %>%
pivot_longer(cols = -Student,
names_to = c("Subject", "Year"),
names_sep = "_",
values_to = "Score")
head(tidy_scores)
## # A tibble: 6 × 4
## Student Subject Year Score
## <chr> <chr> <chr> <int>
## 1 Alice Math 2021 88
## 2 Alice Math 2022 90
## 3 Alice Reading 2021 85
## 4 Alice Reading 2022 88
## 5 Bob Math 2021 92
## 6 Bob Math 2022 89
### Analysis: Average Scores by Year and Subject
# Group by subject and year, then calculate the average score
avg_scores <- tidy_scores %>%
group_by(Subject, Year) %>%
summarise(Average_Score = mean(Score))
## `summarise()` has grouped output by 'Subject'. You can override using the
## `.groups` argument.
print(avg_scores)
## # A tibble: 4 × 3
## # Groups: Subject [2]
## Subject Year Average_Score
## <chr> <chr> <dbl>
## 1 Math 2021 87.8
## 2 Math 2022 88.2
## 3 Reading 2021 85.8
## 4 Reading 2022 87.6
#as we can see here, Math and Reading scores on average increased marginally from 2021 to 2022.
## Dataset 3: City Population Over Time
# Read in the city population dataset
city_pop <- read.csv("City_Population.csv")
# View original data
head(city_pop)
## City Pop_2010 Pop_2015 Pop_2020
## 1 New York 8175133 8550405 8804190
## 2 Los Angeles 3792621 3971883 3980404
## 3 Chicago 2695598 2720546 2716000
## 4 Houston 2099451 2296224 2328000
## 5 Phoenix 1445632 1563025 1680992
### Tidying City Population
# Convert wide format to long format by gathering year-specific population columns
tidy_pop <- city_pop %>%
pivot_longer(cols = starts_with("Pop"),
names_to = "Year",
names_prefix = "Pop_",
values_to = "Population")
head(tidy_pop)
## # A tibble: 6 × 3
## City Year Population
## <chr> <chr> <int>
## 1 New York 2010 8175133
## 2 New York 2015 8550405
## 3 New York 2020 8804190
## 4 Los Angeles 2010 3792621
## 5 Los Angeles 2015 3971883
## 6 Los Angeles 2020 3980404
### Analysis: Population Growth from 2010 to 2020
# Group by city and calculate the population growth as the difference between max and min years
pop_growth <- tidy_pop %>%
group_by(City) %>%
arrange(Year) %>%
summarise(Growth = max(Population) - min(Population))
print(pop_growth)
## # A tibble: 5 × 2
## City Growth
## <chr> <int>
## 1 Chicago 24948
## 2 Houston 228549
## 3 Los Angeles 187783
## 4 New York 629057
## 5 Phoenix 235360
#New York had a higher population growth than any other city, in some cases as much as 2.5X as many inhabitants added, Chicago was quite low at 25K