Tropical Storm Data Wrangling

##task 1.1

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.1
## Warning: package 'tidyr' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.1
## Warning: package 'purrr' was built under R version 4.5.2
## Warning: package 'dplyr' was built under R version 4.5.1
## Warning: package 'stringr' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data=storms
glimpse(data)
## Rows: 19,537
## Columns: 13
## $ name                         <chr> "Amy", "Amy", "Amy", "Amy", "Amy", "Amy",…
## $ year                         <dbl> 1975, 1975, 1975, 1975, 1975, 1975, 1975,…
## $ month                        <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,…
## $ day                          <int> 27, 27, 27, 27, 28, 28, 28, 28, 29, 29, 2…
## $ hour                         <dbl> 0, 6, 12, 18, 0, 6, 12, 18, 0, 6, 12, 18,…
## $ lat                          <dbl> 27.5, 28.5, 29.5, 30.5, 31.5, 32.4, 33.3,…
## $ long                         <dbl> -79.0, -79.0, -79.0, -79.0, -78.8, -78.7,…
## $ status                       <fct> tropical depression, tropical depression,…
## $ category                     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ wind                         <int> 25, 25, 25, 25, 25, 25, 25, 30, 35, 40, 4…
## $ pressure                     <int> 1013, 1013, 1013, 1013, 1012, 1012, 1011,…
## $ tropicalstorm_force_diameter <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ hurricane_force_diameter     <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
str(data)
## tibble [19,537 × 13] (S3: tbl_df/tbl/data.frame)
##  $ name                        : chr [1:19537] "Amy" "Amy" "Amy" "Amy" ...
##  $ year                        : num [1:19537] 1975 1975 1975 1975 1975 ...
##  $ month                       : num [1:19537] 6 6 6 6 6 6 6 6 6 6 ...
##  $ day                         : int [1:19537] 27 27 27 27 28 28 28 28 29 29 ...
##  $ hour                        : num [1:19537] 0 6 12 18 0 6 12 18 0 6 ...
##  $ lat                         : num [1:19537] 27.5 28.5 29.5 30.5 31.5 32.4 33.3 34 34.4 34 ...
##  $ long                        : num [1:19537] -79 -79 -79 -79 -78.8 -78.7 -78 -77 -75.8 -74.8 ...
##  $ status                      : Factor w/ 9 levels "disturbance",..: 7 7 7 7 7 7 7 7 8 8 ...
##  $ category                    : num [1:19537] NA NA NA NA NA NA NA NA NA NA ...
##  $ wind                        : int [1:19537] 25 25 25 25 25 25 25 30 35 40 ...
##  $ pressure                    : int [1:19537] 1013 1013 1013 1013 1012 1012 1011 1006 1004 1002 ...
##  $ tropicalstorm_force_diameter: int [1:19537] NA NA NA NA NA NA NA NA NA NA ...
##  $ hurricane_force_diameter    : int [1:19537] NA NA NA NA NA NA NA NA NA NA ...

##task 1.2 ##The storms dataset contains over 19,537 rows and 13 columns. Each row represents a single recorded observation of a storm at a specific date and time, including information such as its name, location, wind speed, pressure, and status. ##Two potential data quality issues are the presence of missing values in the pressure variable and the fact that some storms change status over time, which requires careful filtering during analysis. #task 2.1 (filtering and selecting)

hurricane_subset <- storms %>%
  filter(status == "hurricane", year > 2000) %>%
  select(name, year, month, day, category, wind)
print(hurricane_subset)
## # A tibble: 2,506 × 6
##    name   year month   day category  wind
##    <chr> <dbl> <dbl> <int>    <dbl> <int>
##  1 Erin   2001     9     9        1    75
##  2 Erin   2001     9     9        2    90
##  3 Erin   2001     9     9        2    95
##  4 Erin   2001     9     9        3   105
##  5 Erin   2001     9    10        3   105
##  6 Erin   2001     9    10        3   105
##  7 Erin   2001     9    10        3   100
##  8 Erin   2001     9    10        2    90
##  9 Erin   2001     9    11        1    80
## 10 Erin   2001     9    11        1    80
## # ℹ 2,496 more rows

#task 2.2 (mutating)

hurricane_subset <- hurricane_subset %>%
  mutate(wind_kph = wind * 1.852)
print(hurricane_subset)
## # A tibble: 2,506 × 7
##    name   year month   day category  wind wind_kph
##    <chr> <dbl> <dbl> <int>    <dbl> <int>    <dbl>
##  1 Erin   2001     9     9        1    75     139.
##  2 Erin   2001     9     9        2    90     167.
##  3 Erin   2001     9     9        2    95     176.
##  4 Erin   2001     9     9        3   105     194.
##  5 Erin   2001     9    10        3   105     194.
##  6 Erin   2001     9    10        3   105     194.
##  7 Erin   2001     9    10        3   100     185.
##  8 Erin   2001     9    10        2    90     167.
##  9 Erin   2001     9    11        1    80     148.
## 10 Erin   2001     9    11        1    80     148.
## # ℹ 2,496 more rows

#task 2.3 (summarizing)

storm_of_the_century <- storms %>%
  filter(year >= 2010, status == "hurricane") %>%
  group_by(year, name) %>%
  summarise(
    max_wind = max(wind, na.rm = TRUE),
    min_pressure = min(pressure, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(year, desc(max_wind))
print(storm_of_the_century)
## # A tibble: 102 × 4
##     year name     max_wind min_pressure
##    <dbl> <chr>       <int>        <int>
##  1  2010 Igor          135          924
##  2  2010 Earl          125          927
##  3  2010 Julia         120          948
##  4  2010 Danielle      115          942
##  5  2010 Karl          110          956
##  6  2010 Alex           95          946
##  7  2010 Paula          90          981
##  8  2010 Richard        85          977
##  9  2010 Tomas          85          982
## 10  2010 Lisa           75          982
## # ℹ 92 more rows

##task 3.1 (pivoting)

storm_long <- storms %>%
  select(category, wind, pressure) %>%
  pivot_longer(
    cols = c(wind, pressure),
    names_to = "metric",
    values_to = "value"
  )
print(storm_long)
## # A tibble: 39,074 × 3
##    category metric   value
##       <dbl> <chr>    <int>
##  1       NA wind        25
##  2       NA pressure  1013
##  3       NA wind        25
##  4       NA pressure  1013
##  5       NA wind        25
##  6       NA pressure  1013
##  7       NA wind        25
##  8       NA pressure  1013
##  9       NA wind        25
## 10       NA pressure  1012
## # ℹ 39,064 more rows

##task 3.2 (unite)

storms_with_date <- storms %>%
  unite(date_id, year, month, day, sep = "-", remove = FALSE)
storms_with_date

#part 4 (interpretation) ##Based on the results from Task 2.3, the year with the most intense storm activity is the year that recorded the highest maximum wind speed along with the lowest minimum pressure among hurricanes since 2010. This combination indicates stronger and more intense storms, as higher wind speeds and lower pressure are key indicators of hurricane severity. The summarized data clearly shows that (2010 Igor 135 924)the year 2010 is standing out compared to others.