# Load Required Libraries
library(tidyverse)  # Includes dplyr and tidyr
## Warning: package 'dplyr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)      # For reading CSV files

# Read the CSV file
weather_data <- read_csv("Weather_Data.csv")
## Rows: 3 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): City, Temp_Jan, Temp_Feb, Temp_Mar, Humid_Jan, Humid_Feb, Humid_Mar
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Print the CSV file
print(weather_data)
## # A tibble: 3 × 7
##   City        Temp_Jan Temp_Feb Temp_Mar Humid_Jan Humid_Feb Humid_Mar
##   <chr>       <chr>    <chr>    <chr>    <chr>     <chr>     <chr>    
## 1 New York    32°F     35°F     42°F     75%       72%       68%      
## 2 Los Angeles 58°F     60°F     65°F     65%       63%       60%      
## 3 Chicago     28°F     30°F     40°F     80%       78%       75%
# Convert to long format using pivot_longer
weather_tidy <- weather_data %>%
  pivot_longer(
    cols = -City, 
    names_to = c("Measure", "Month"),
    names_sep = "_",
    values_to = "Value"
  ) %>%
  mutate(
    Value = as.numeric(str_replace(Value, "°F|%", "")),  # Convert to numeric
    Month = factor(Month, levels = c("Jan", "Feb", "Mar")) # Order months
  )

# Display cleaned data
head(weather_tidy)
## # A tibble: 6 × 4
##   City     Measure Month Value
##   <chr>    <chr>   <fct> <dbl>
## 1 New York Temp    Jan      32
## 2 New York Temp    Feb      35
## 3 New York Temp    Mar      42
## 4 New York Humid   Jan      75
## 5 New York Humid   Feb      72
## 6 New York Humid   Mar      68
print(weather_tidy)
## # A tibble: 18 × 4
##    City        Measure Month Value
##    <chr>       <chr>   <fct> <dbl>
##  1 New York    Temp    Jan      32
##  2 New York    Temp    Feb      35
##  3 New York    Temp    Mar      42
##  4 New York    Humid   Jan      75
##  5 New York    Humid   Feb      72
##  6 New York    Humid   Mar      68
##  7 Los Angeles Temp    Jan      58
##  8 Los Angeles Temp    Feb      60
##  9 Los Angeles Temp    Mar      65
## 10 Los Angeles Humid   Jan      65
## 11 Los Angeles Humid   Feb      63
## 12 Los Angeles Humid   Mar      60
## 13 Chicago     Temp    Jan      28
## 14 Chicago     Temp    Feb      30
## 15 Chicago     Temp    Mar      40
## 16 Chicago     Humid   Jan      80
## 17 Chicago     Humid   Feb      78
## 18 Chicago     Humid   Mar      75
#transforms the long-format data back into a wide format using pivot wider
weather_clean <- weather_tidy %>%
  pivot_wider(names_from = Measure, values_from = Value)

# Display cleaned data
head(weather_clean)
## # A tibble: 6 × 4
##   City        Month  Temp Humid
##   <chr>       <fct> <dbl> <dbl>
## 1 New York    Jan      32    75
## 2 New York    Feb      35    72
## 3 New York    Mar      42    68
## 4 Los Angeles Jan      58    65
## 5 Los Angeles Feb      60    63
## 6 Los Angeles Mar      65    60
# Temperature Trends
ggplot(weather_clean, aes(x = Month, y = Temp, group = City, color = City)) +
  geom_line(size = 1) +
  geom_point(size = 3) +
  labs(title = "Temperature Trends (Jan-Mar)", y = "Temperature (°F)") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Humidity Trends
ggplot(weather_clean, aes(x = Month, y = Humid, group = City, color = City)) +
  geom_line(size = 1) +
  geom_point(size = 3) +
  labs(title = "Humidity Trends (Jan-Mar)", y = "Humidity (%)") +
  theme_minimal()

# Calculate mean temperature and humidity per city
city_summary <- weather_clean %>%
  group_by(City) %>%
  summarise(
    Avg_Temp = mean(Temp, na.rm = TRUE),
    Avg_Humid = mean(Humid, na.rm = TRUE)
  )

# Display summary
print(city_summary)
## # A tibble: 3 × 3
##   City        Avg_Temp Avg_Humid
##   <chr>          <dbl>     <dbl>
## 1 Chicago         32.7      77.7
## 2 Los Angeles     61        62.7
## 3 New York        36.3      71.7
#as we can see Los Angeles is a much warmer city than NY or Chicago. Interestingly, Chicago is a more humid city in the Jan-Mar months, followed by NY and then LA.
## Dataset 2: Student Test Scores
library(dplyr)
library(tidyr)
student_scores <- read.csv("Student_Scores.csv")

# View original data
head(student_scores)
##   Student Math_2021 Math_2022 Reading_2021 Reading_2022
## 1   Alice        88        90           85           88
## 2     Bob        92        89           90           91
## 3 Charlie        79        81           78           80
## 4   Diana        95        94           92           93
## 5   Ethan        85        87           84           86
### Tidying Student Scores
# Convert wide format to long format by separating subject and year into two columns
tidy_scores <- student_scores %>%
  pivot_longer(cols = -Student,
               names_to = c("Subject", "Year"),
               names_sep = "_",
               values_to = "Score")

head(tidy_scores)
## # A tibble: 6 × 4
##   Student Subject Year  Score
##   <chr>   <chr>   <chr> <int>
## 1 Alice   Math    2021     88
## 2 Alice   Math    2022     90
## 3 Alice   Reading 2021     85
## 4 Alice   Reading 2022     88
## 5 Bob     Math    2021     92
## 6 Bob     Math    2022     89
### Analysis: Average Scores by Year and Subject

# Group by subject and year, then calculate the average score
avg_scores <- tidy_scores %>%
  group_by(Subject, Year) %>%
  summarise(Average_Score = mean(Score))
## `summarise()` has grouped output by 'Subject'. You can override using the
## `.groups` argument.
print(avg_scores)
## # A tibble: 4 × 3
## # Groups:   Subject [2]
##   Subject Year  Average_Score
##   <chr>   <chr>         <dbl>
## 1 Math    2021           87.8
## 2 Math    2022           88.2
## 3 Reading 2021           85.8
## 4 Reading 2022           87.6
#as we can see here, Math and Reading scores on average increased marginally from 2021 to 2022.
## Dataset 3: City Population Over Time
# Read in the city population dataset
city_pop <- read.csv("City_Population.csv")

# View original data
head(city_pop)
##          City Pop_2010 Pop_2015 Pop_2020
## 1    New York  8175133  8550405  8804190
## 2 Los Angeles  3792621  3971883  3980404
## 3     Chicago  2695598  2720546  2716000
## 4     Houston  2099451  2296224  2328000
## 5     Phoenix  1445632  1563025  1680992
### Tidying City Population
# Convert wide format to long format by gathering year-specific population columns
tidy_pop <- city_pop %>%
  pivot_longer(cols = starts_with("Pop"),
               names_to = "Year",
               names_prefix = "Pop_",
               values_to = "Population")

head(tidy_pop)
## # A tibble: 6 × 3
##   City        Year  Population
##   <chr>       <chr>      <int>
## 1 New York    2010     8175133
## 2 New York    2015     8550405
## 3 New York    2020     8804190
## 4 Los Angeles 2010     3792621
## 5 Los Angeles 2015     3971883
## 6 Los Angeles 2020     3980404
### Analysis: Population Growth from 2010 to 2020
# Group by city and calculate the population growth as the difference between max and min years
pop_growth <- tidy_pop %>%
  group_by(City) %>%
  arrange(Year) %>%
  summarise(Growth = max(Population) - min(Population))

print(pop_growth)
## # A tibble: 5 × 2
##   City        Growth
##   <chr>        <int>
## 1 Chicago      24948
## 2 Houston     228549
## 3 Los Angeles 187783
## 4 New York    629057
## 5 Phoenix     235360
#New York had a higher population growth than any other city, in some cases as much as 2.5X as many inhabitants added, Chicago was quite low at 25K