library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Project 2 - Dataset 1

Tidying

Load the Dataset

data1 <- read.csv("https://raw.githubusercontent.com/mraynolds/data_607/refs/heads/main/data_607_project_2_dataset_1.csv")

head(data1)
##          City Temp_Jan Temp_Feb Temp_Mar Humid_Jan Humid_Feb Humid_Mar
## 1    New York     32°F     35°F     42°F       75%       72%       68%
## 2 Los Angeles     58°F     60°F     65°F       65%       63%       60%
## 3     Chicago     28°F     30°F     40°F       80%       78%       75%

Pivot data longer

The dataset has one primary issue keeping it from being tidy: each column header combines two variables, a month and an environmental measurement (temperature or humidity).

The following code pivots the data longer while separating the column names into their component variables.

data1 <- data1 |> 
  pivot_longer(
    cols = !City,
    names_to = c("environment", "month"),
    names_sep = "_",
    values_to = "measurement"
  )

head(data1)
## # A tibble: 6 × 4
##   City     environment month measurement
##   <chr>    <chr>       <chr> <chr>      
## 1 New York Temp        Jan   32°F       
## 2 New York Temp        Feb   35°F       
## 3 New York Temp        Mar   42°F       
## 4 New York Humid       Jan   75%        
## 5 New York Humid       Feb   72%        
## 6 New York Humid       Mar   68%

Create a factor for months

The following vector defines the calendar order of the month levels, which will be used to convert month from character to factor.

month_levels <- c(
  "Jan", "Feb", "Mar", "Apr", "May", "Jun",
  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)
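
Without these levels, months would sort alphabetically rather than chronologically. A quick illustration (an addition, not part of the original pipeline):

sort(c("Mar", "Feb", "Jan"))                                # "Feb" "Jan" "Mar"
sort(factor(c("Mar", "Feb", "Jan"), levels = month_levels)) # Jan Feb Mar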

Clean up the data

The data could now use some cleaning.

The code block below does the following:

  • renames “Humid” to “humidity_pct” to indicate that the measurement is percent humidity
  • renames “Temp” to “temp_f” to indicate that the measurement is temperature in Fahrenheit
  • strips the unit symbols (degrees Fahrenheit and the percent sign) from the measurement column and parses the remaining digits as numbers
  • converts the month column to a factor using month_levels
  • renames the City column to city

data_long_1 <- data1 |>
  mutate(
    environment = str_replace_all(environment, "Humid", "humidity_pct"),
    environment = str_replace_all(environment, "Temp", "temp_f"),
    # parse_number() drops the °F and % symbols and returns a numeric column,
    # so a separate as.numeric() call is not needed
    measurement = parse_number(measurement),
    month = factor(month, levels = month_levels)
  ) |> 
  rename(city = City)

head(data_long_1)
## # A tibble: 6 × 4
##   city     environment  month measurement
##   <chr>    <chr>        <fct>       <dbl>
## 1 New York temp_f       Jan            32
## 2 New York temp_f       Feb            35
## 3 New York temp_f       Mar            42
## 4 New York humidity_pct Jan            75
## 5 New York humidity_pct Feb            72
## 6 New York humidity_pct Mar            68

Pivot Wider

The data in the previous data frame, “data_long_1”, is still not tidy: two different variables, humidity and temperature, share the measurement column.

The following code block pivots the data wider so that temperature and humidity are separated into their own columns.

data_tidy_1 <- data_long_1 |> 
  pivot_wider(
    names_from = environment,
    values_from = measurement
  )

head(data_tidy_1)
## # A tibble: 6 × 4
##   city        month temp_f humidity_pct
##   <chr>       <fct>  <dbl>        <dbl>
## 1 New York    Jan       32           75
## 2 New York    Feb       35           72
## 3 New York    Mar       42           68
## 4 Los Angeles Jan       58           65
## 5 Los Angeles Feb       60           63
## 6 Los Angeles Mar       65           60

The data is now tidy and ready for analysis.

Analysis

Averages

The code block below calculates the average temperature and humidity for the dataset, first by month, and then by city.

data_tidy_1 |>
  group_by(month) |> 
  summarise(
    avg_temp_f = round(mean(temp_f),1),
    avg_humidity_pct = round(mean(humidity_pct),1)
  )
## # A tibble: 3 × 3
##   month avg_temp_f avg_humidity_pct
##   <fct>      <dbl>            <dbl>
## 1 Jan         39.3             73.3
## 2 Feb         41.7             71  
## 3 Mar         49               67.7
data_tidy_1 |>
  group_by(city) |> 
  summarise(
    avg_temp_f = round(mean(temp_f),1),
    avg_humidity_pct = round(mean(humidity_pct),1)
  )
## # A tibble: 3 × 3
##   city        avg_temp_f avg_humidity_pct
##   <chr>            <dbl>            <dbl>
## 1 Chicago           32.7             77.7
## 2 Los Angeles       61               62.7
## 3 New York          36.3             71.7
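
Since the two summaries above differ only in their grouping column, they could also be written once as a small helper. A sketch (avg_by is a hypothetical name, not part of the original analysis):

# A reusable version of the summary above (sketch only)
avg_by <- function(df, group_col) {
  df |>
    group_by({{ group_col }}) |>
    summarise(across(c(temp_f, humidity_pct), ~ round(mean(.x), 1),
                     .names = "avg_{.col}"))
}

avg_by(data_tidy_1, month)
avg_by(data_tidy_1, city)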

Plot

A plot of temperature over time.

ggplot(data_tidy_1, aes(x = month, y = temp_f, group = city, color = city)) + 
  geom_line()

A plot of humidity over time.

ggplot(data_tidy_1, aes(x = month, y = humidity_pct, group = city, color = city)) + 
  geom_line()

A plot of temperature and humidity together.

ggplot(data_tidy_1, aes(x = month)) + geom_line(aes(y = humidity_pct, group = city, color = city)) +
  geom_line(aes(y = temp_f, group = city, color = city)) +
  scale_y_continuous(name = "Percent Humidity", sec.axis = sec_axis(~., name = "Temperature in Fahrenheit")) 
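
Because both series share one identity-scaled axis and the same color mapping, temperature and humidity are hard to tell apart in the plot above. A sketch of a more readable version (an addition, assuming the value ranges loaded earlier): temperature is rescaled onto the humidity axis, the secondary axis undoes the rescaling, and linetype distinguishes the two measurements.

# Rescale temperature onto the humidity axis so both series stay legible
scale_factor <- max(data_tidy_1$humidity_pct) / max(data_tidy_1$temp_f)

ggplot(data_tidy_1, aes(x = month, group = city, color = city)) +
  geom_line(aes(y = humidity_pct, linetype = "Humidity")) +
  geom_line(aes(y = temp_f * scale_factor, linetype = "Temperature")) +
  scale_y_continuous(
    name = "Percent Humidity",
    sec.axis = sec_axis(~ . / scale_factor, name = "Temperature in Fahrenheit")
  ) +
  labs(linetype = "Measurement")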

Project 2 - Dataset 2

Tidying

Load the Dataset

data2 <- read.csv("https://raw.githubusercontent.com/mraynolds/data_607/refs/heads/main/data_607_project_2_dataset_2.csv")

head(data2)
##   Store.ID        Store Country     January    February       March       April
## 1        1    Palisades      US $371,700.00 $435,950.00 $372,460.00 $192,260.00
## 2        2     Billings      US  $97,530.00 $324,140.00 $454,480.00  $36,810.00
## 3        3    Laguardia      US $346,130.00 $157,510.00 $288,990.00 $358,190.00
## 4        4 Cheeseburger      US $442,010.00 $212,390.00 $183,580.00 $308,650.00
## 5        5      Detroit      US  $33,250.00  $36,840.00 $320,170.00 $242,650.00
## 6        6        Towns      US  $16,632.00  $25,372.00  $38,178.00  $13,222.00
##           May        June        July      August   September     October
## 1 $157,550.00 $332,550.00  $89,630.00 $372,090.00 $421,670.00 $173,010.00
## 2 $219,790.00 $210,970.00  $84,840.00 $175,440.00 $283,710.00 $275,320.00
## 3  $96,860.00 $461,950.00  $80,440.00 $404,990.00 $450,630.00 $327,270.00
## 4 $184,340.00 $156,540.00 $328,180.00 $281,430.00 $498,150.00 $473,150.00
## 5 $350,300.00 $421,980.00 $307,190.00  $16,900.00 $443,990.00 $346,230.00
## 6  $16,031.00  $37,162.00  $22,541.00  $16,700.00  $10,475.00   $1,897.00
##      November
## 1 $173,220.00
## 2 $401,940.00
## 3 $370,100.00
## 4  $23,740.00
## 5 $312,670.00
## 6  $10,493.00

Pivot data longer

The dataset appears to be a list of store IDs, their locations, their countries, and a monetary amount listed by month in a wide format.

To start, the data will need to be pivoted longer in order to take the months out of the column headers and place them into rows.

The following code pivots the data longer into a column for the months and a column named “dollars”, since it is not clear what the monetary data represents and the only indicator is a dollar symbol.

A note on the money: in reality, this would require an investigation into the dataset and domain knowledge of what the money represents. All columns show dollar signs, yet several countries are listed, including the US and Canada, which both use dollars but different dollars, and several others that use currencies other than the dollar; even so, their values are shown as dollars. For the purposes of this project, we will assume the dataset is listed exclusively in US dollars since the source data does not specify.

data2 <- data2 |> 
  pivot_longer(
    cols = !(Store.ID:Country),
    names_to = "month",
    values_to = "dollars"
  )

head(data2)
## # A tibble: 6 × 5
##   Store.ID Store     Country month    dollars    
##      <int> <chr>     <chr>   <chr>    <chr>      
## 1        1 Palisades US      January  $371,700.00
## 2        1 Palisades US      February $435,950.00
## 3        1 Palisades US      March    $372,460.00
## 4        1 Palisades US      April    $192,260.00
## 5        1 Palisades US      May      $157,550.00
## 6        1 Palisades US      June     $332,550.00

Make a factor for months

The following code defines the calendar order of the month levels for the month factor.

month_levels <- c(
  "January", "February", "March", "April", "May", "June",
  "July", "August", "September", "October", "November", "December"
)

Clean up the data frame

The following code cleans up the data frame. It:

  • renames the columns with consistent lowercase names
  • converts month from character to factor
  • converts dollars to a double, removing the currency symbol and commas
  • converts store_id to character, since it is an identifier rather than a quantity

data2 <- data2 |> 
  rename(
    store_id = Store.ID,
    store = Store,
    country = Country
  ) |>
  mutate(
    month = factor(month, levels = month_levels),
    dollars = parse_number(dollars),
    store_id = as.character(store_id)
  )

head(data2)
## # A tibble: 6 × 5
##   store_id store     country month    dollars
##   <chr>    <chr>     <chr>   <fct>      <dbl>
## 1 1        Palisades US      January   371700
## 2 1        Palisades US      February  435950
## 3 1        Palisades US      March     372460
## 4 1        Palisades US      April     192260
## 5 1        Palisades US      May       157550
## 6 1        Palisades US      June      332550

Tidy data

The data frame is now clean and tidy. The data is significantly longer as a result but now it is in a form that will allow it to be analyzed easily and consistently.
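
As a quick sanity check of tidiness (an added check, not part of the original write-up), every store should now contribute exactly one row per month:

# Expect zero rows back: one observation per store per month
data2 |> 
  count(store_id, month) |> 
  filter(n != 1)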

Analysis

The code below totals the monetary data by store, sorts it in descending order, and plots it. Store 3, Laguardia in the US, has the highest total while store 11, Chelsea in the UK, has the lowest.

total_dollars_by_store <- data2 |> 
  group_by(store_id, store, country) |> 
  summarise(total_dollars = sum(dollars), .groups = "drop_last") |> 
  arrange(desc(total_dollars)) |> 
  print()
## # A tibble: 20 × 4
## # Groups:   store_id, store [20]
##    store_id store        country total_dollars
##    <chr>    <chr>        <chr>           <dbl>
##  1 3        Laguardia    US            3343060
##  2 4        Cheeseburger US            3092160
##  3 1        Palisades    US            3092090
##  4 5        Detroit      US            2832170
##  5 2        Billings     US            2564970
##  6 20       Camembert    France        1047658
##  7 8        Eh           Canada         367044
##  8 9        Maple        Canada         362619
##  9 19       Paolo        Brazil         330761
## 10 14       Innit        UK             315480
## 11 17       Steve        China          308982
## 12 16       Fuji         Japan          283811
## 13 10       Victoria     Canada         264917
## 14 7        Tim Horton's Canada         258845
## 15 15       Hokkaido     Japan          244173
## 16 13       Blimey       UK             240649
## 17 18       Nanjing      China          219850
## 18 6        Towns        US             208703
## 19 12       Wimbledon    UK             199778
## 20 11       Chelsea      UK             191157
ggplot(total_dollars_by_store, aes(x = reorder(store_id, -total_dollars), y = total_dollars)) +
  geom_col()

The following code adds up the monetary data by country and sorts it in descending order. It also calculates the average dollars per store by country. From this data it is clear that the US has the highest total dollars. This is to be expected as it has the most stores; however, it also has the highest average dollars per store.

A notable insight from this data is that while France has only one store, it has the third highest total dollars and the second highest average dollars per store.

total_dollars_by_country <- data2 |> 
  group_by(country) |> 
  summarise(
    # each store contributes 11 rows (January through November),
    # so n() / 11 counts the stores in each country
    store_per_country = n() / 11,
    total_dollars = sum(dollars),
    avg_dollars_per_store = total_dollars / store_per_country
  ) |> 
  arrange(desc(total_dollars)) |> 
  print()
## # A tibble: 7 × 4
##   country store_per_country total_dollars avg_dollars_per_store
##   <chr>               <dbl>         <dbl>                 <dbl>
## 1 US                      6      15133153              2522192.
## 2 Canada                  4       1253425               313356.
## 3 France                  1       1047658              1047658 
## 4 UK                      4        947064               236766 
## 5 China                   2        528832               264416 
## 6 Japan                   2        527984               263992 
## 7 Brazil                  1        330761               330761
ggplot(total_dollars_by_country, aes(x = reorder(country, -total_dollars), y = total_dollars)) +
  geom_col()

ggplot(total_dollars_by_country, aes(x = reorder(country, -avg_dollars_per_store), y = avg_dollars_per_store)) +
  geom_col()

The following code totals the dollars by month. It seems September has the highest dollar total while July has the lowest. Note that the source data contains no December column, so only eleven months appear.

dollars_by_month <- data2 |> 
  group_by(month) |> 
  summarise(
    total_dollars = sum(dollars)
  ) |> 
  arrange(desc(total_dollars)) |> 
  print()
## # A tibble: 11 × 2
##    month     total_dollars
##    <fct>             <dbl>
##  1 September       2568518
##  2 November        2040795
##  3 March           1995192
##  4 June            1993274
##  5 October         1955800
##  6 February        1715421
##  7 January         1694355
##  8 August          1612337
##  9 April           1580057
## 10 May             1368979
## 11 July            1244149
ggplot(dollars_by_month, aes(x = month, y = total_dollars)) +
  geom_col()
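
Because month is a factor carrying all twelve levels, the absent December can also be surfaced explicitly (a small added check):

# .drop = FALSE keeps empty factor levels, so December appears with n = 0
data2 |> 
  count(month, .drop = FALSE) |> 
  filter(n == 0)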

Project 2 - Dataset 3

Tidying

Load the Dataset

The following code loads the dataset into R and fills empty cells with NA.

data3 <- read.csv("https://raw.githubusercontent.com/mraynolds/data_607/refs/heads/main/data_607_project_2_dataset_3.csv", na = c("","NA"))

head(data3)
##   X  Name                Duration        Wind.speed             Pressure
## 1 0   One      June 16 – 26, 1921 90 mph (150 km/h) 980 hPa (28.94 inHg)
## 2 1   Two   September 4 – 8, 1921 80 mph (130 km/h) 985 hPa (29.09 inHg)
## 3 2  Four  September 8 – 14, 1921 90 mph (150 km/h) 979 hPa (28.91 inHg)
## 4 3 Three September 18 – 24, 1922 80 mph (130 km/h) 987 hPa (29.15 inHg)
## 5 4  Four September 10 – 13, 1923 80 mph (130 km/h) 986 hPa (29.12 inHg)
## 6 5   Six   October 12 – 17, 1923 80 mph (130 km/h) 983 hPa (29.03 inHg)
##                            Areas.affected Deaths      Damage REf Category
## 1         Central America, Gulf of Mexico      0        <NA> [2]        1
## 2                                  Mexico    215 $19 million [2]        1
## 3                                    None      0        <NA> [2]        1
## 4                                    None      0        <NA> [2]        1
## 5                            Newfoundland      0        <NA> [2]        1
## 6 Mexico, Gulf Coast of the United States      0        <NA> [2]        1

The data has several issues that keep it from being tidy, and even more that keep it from being clean.

To tidy it, the following transformations will be executed:

  • separate duration into start and end dates and create an actual duration column
  • separate the environmental metrics (wind speed and pressure) so that each column contains a single numeric variable
  • separate and convert the monetary damage into columns for notes and numeric values, eliminating the text-based values
  • pivot the areas affected longer so they can be evaluated individually

Most columns also need some additional cleaning.

The code block below begins separating the date data, which is mostly a range of two dates with very inconsistent formatting. It splits the original duration column into individual components for the start month, day, and year; the end month, day, and year; and a footnote column. The original dataset did not contain the actual footnote text.

data3 <- data3 |> 
  select(!X) |> 
  separate_wider_regex(
    Duration,
    patterns = c(
     start_month = "[A-Za-z]+",
     "\\s*",
     start_day = "[0-9]+",
     ",*",
     "\\s*",
     start_year = "[0-9]*",
     "\\W*",
     end_month = "[A-Za-z]*",
     "\\W*",
     end_day = "[0-9]*",
     "\\W*",
     end_year = "[0-9]*",
     "\\s*",
     date_footnote = ".*"
    )
  )

head(data3)
## # A tibble: 6 × 15
##   Name  start_month start_day start_year end_month end_day end_year
##   <chr> <chr>       <chr>     <chr>      <chr>     <chr>   <chr>   
## 1 One   June        16        ""         ""        26      1921    
## 2 Two   September   4         ""         ""        8       1921    
## 3 Four  September   8         ""         ""        14      1921    
## 4 Three September   18        ""         ""        24      1922    
## 5 Four  September   10        ""         ""        13      1923    
## 6 Six   October     12        ""         ""        17      1923    
## # ℹ 8 more variables: date_footnote <chr>, Wind.speed <chr>, Pressure <chr>,
## #   Areas.affected <chr>, Deaths <chr>, Damage <chr>, REf <chr>, Category <int>
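
To see how the pattern consumes a raw value, it can be tested on a single string (an added illustration, not part of the pipeline):

# The unnamed pieces ("\\s*", ",*", "\\W*") are matched but dropped;
# only the named pieces become columns.
tibble(Duration = "September 18 – 24, 1922") |> 
  separate_wider_regex(
    Duration,
    patterns = c(
      start_month = "[A-Za-z]+", "\\s*", start_day = "[0-9]+", ",*", "\\s*",
      start_year = "[0-9]*", "\\W*", end_month = "[A-Za-z]*", "\\W*",
      end_day = "[0-9]*", "\\W*", end_year = "[0-9]*", "\\s*",
      date_footnote = ".*"
    )
  )
# start_year and end_month come back empty here, which is why the next
# step fills them in from the neighboring columns.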

The following code fills in the gaps left by the separation: it copies the end year into the start year wherever there is no start year, and copies the start month, day, and year into the corresponding end columns wherever those are missing.

data3 <- data3 |> mutate(
  start_year = if_else(start_year == "", end_year, start_year),
  end_month = if_else(end_month == "", start_month, end_month),
  end_day = if_else(end_day == "", start_day, end_day),
  end_year = if_else(end_year == "", start_year, end_year)
)

head(data3)
## # A tibble: 6 × 15
##   Name  start_month start_day start_year end_month end_day end_year
##   <chr> <chr>       <chr>     <chr>      <chr>     <chr>   <chr>   
## 1 One   June        16        1921       June      26      1921    
## 2 Two   September   4         1921       September 8       1921    
## 3 Four  September   8         1921       September 14      1921    
## 4 Three September   18        1922       September 24      1922    
## 5 Four  September   10        1923       September 13      1923    
## 6 Six   October     12        1923       October   17      1923    
## # ℹ 8 more variables: date_footnote <chr>, Wind.speed <chr>, Pressure <chr>,
## #   Areas.affected <chr>, Deaths <chr>, Damage <chr>, REf <chr>, Category <int>

The following code block converts the months to their numeric equivalents and then builds columns containing the full start and end dates, as well as the difference between the two, giving a duration column.

The duration column is a nice addition to the data as it summarizes each storm's length in days, a more easily read metric than a pair of dates.

data3 <- data3 |>
  mutate(
    start_month = match(start_month, month.name),
    start_date = make_date(start_year, start_month, start_day),
    end_month = match(end_month, month.name),
    end_date = make_date(end_year, end_month, end_day),
    duration_days = as.numeric(end_date - start_date)
    ) |> 
  select(!start_month:end_year) |> 
  relocate(start_date:duration_days, .after = Name)

head(data3)
## # A tibble: 6 × 12
##   Name  start_date end_date   duration_days date_footnote Wind.speed    Pressure
##   <chr> <date>     <date>             <dbl> <chr>         <chr>         <chr>   
## 1 One   1921-06-16 1921-06-26            10 ""            90 mph (150 … 980 hPa…
## 2 Two   1921-09-04 1921-09-08             4 ""            80 mph (130 … 985 hPa…
## 3 Four  1921-09-08 1921-09-14             6 ""            90 mph (150 … 979 hPa…
## 4 Three 1922-09-18 1922-09-24             6 ""            80 mph (130 … 987 hPa…
## 5 Four  1923-09-10 1923-09-13             3 ""            80 mph (130 … 986 hPa…
## 6 Six   1923-10-12 1923-10-17             5 ""            80 mph (130 … 983 hPa…
## # ℹ 5 more variables: Areas.affected <chr>, Deaths <chr>, Damage <chr>,
## #   REf <chr>, Category <int>

The following code separates the wind speed and pressure data into wider columns so that each column contains only a single numeric value.

data3 <- data3 |> 
  separate_wider_regex(
    Wind.speed,
    patterns = c(
      wind_speed_mph = "[0-9]+",
      "\\D+",
      wind_speed_km_per_hr = "[0-9]+",
      "\\D+"
    )
  ) |> 
  mutate(Pressure = str_remove_all(Pressure, ",")) |> 
  separate_wider_regex(
    Pressure,
    patterns = c(
      pressure_hPa = "[0-9]+",
      "\\D+",
      pressure_inHg = "[0-9.]+",
      "\\D+"
    ),
    too_few = "align_start"
  )

head(data3)
## # A tibble: 6 × 14
##   Name  start_date end_date   duration_days date_footnote wind_speed_mph
##   <chr> <date>     <date>             <dbl> <chr>         <chr>         
## 1 One   1921-06-16 1921-06-26            10 ""            90            
## 2 Two   1921-09-04 1921-09-08             4 ""            80            
## 3 Four  1921-09-08 1921-09-14             6 ""            90            
## 4 Three 1922-09-18 1922-09-24             6 ""            80            
## 5 Four  1923-09-10 1923-09-13             3 ""            80            
## 6 Six   1923-10-12 1923-10-17             5 ""            80            
## # ℹ 8 more variables: wind_speed_km_per_hr <chr>, pressure_hPa <chr>,
## #   pressure_inHg <chr>, Areas.affected <chr>, Deaths <chr>, Damage <chr>,
## #   REf <chr>, Category <int>

The following code block cleans up the damage information and converts it into numeric values. The formatting is very inconsistent as loaded; this code makes it consistent so that it can be used in analysis.

data3 <- data3 |> 
  separate_wider_regex(Damage,
                       patterns = c(damage_notes = "^\\[.*|>*",
                                    "\\$?",
                                    damage_amount_dollars = "\\$?\\s*\\d*\\.*\\d*",
                                    "\\s*",
                                    damage_modifier = "[A-Za-z]*"
                                    ), too_few = "align_start")
data3 <- data3 |> mutate(
    damage_amount_dollars = str_remove_all(damage_amount_dollars, "\\$"),
    damage_amount_dollars = as.numeric(damage_amount_dollars),
    damage_amount_dollars = if_else(damage_modifier == "thousand", damage_amount_dollars*1000, damage_amount_dollars),
    damage_amount_dollars = if_else(damage_modifier == "million", damage_amount_dollars*1000000, damage_amount_dollars),
    damage_amount_dollars = if_else(damage_modifier == "billion", damage_amount_dollars*1000000000, damage_amount_dollars),
    damage_notes = if_else(damage_modifier == "Unknown", "Unknown", damage_notes),
    damage_notes = if_else(damage_notes == ">", "Damage greater than damage_amount", damage_notes)
  )
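
The chain of if_else calls applies one unit scaling per modifier; the same logic could be expressed as a single case_when. A sketch (scale_damage is a hypothetical helper, not used above):

# Equivalent unit scaling written as one case_when (sketch only)
scale_damage <- function(amount, modifier) {
  amount * case_when(
    modifier == "thousand" ~ 1e3,
    modifier == "million"  ~ 1e6,
    modifier == "billion"  ~ 1e9,
    .default = 1
  )
}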

head(data3)
## # A tibble: 6 × 16
##   Name  start_date end_date   duration_days date_footnote wind_speed_mph
##   <chr> <date>     <date>             <dbl> <chr>         <chr>         
## 1 One   1921-06-16 1921-06-26            10 ""            90            
## 2 Two   1921-09-04 1921-09-08             4 ""            80            
## 3 Four  1921-09-08 1921-09-14             6 ""            90            
## 4 Three 1922-09-18 1922-09-24             6 ""            80            
## 5 Four  1923-09-10 1923-09-13             3 ""            80            
## 6 Six   1923-10-12 1923-10-17             5 ""            80            
## # ℹ 10 more variables: wind_speed_km_per_hr <chr>, pressure_hPa <chr>,
## #   pressure_inHg <chr>, Areas.affected <chr>, Deaths <chr>,
## #   damage_notes <chr>, damage_amount_dollars <dbl>, damage_modifier <chr>,
## #   REf <chr>, Category <int>

The code below saves a tidy version of the dataset with the “Areas.affected” and “REf” columns removed. The result is a tidy dataset that can be analyzed for all metrics except areas affected and references. The areas affected will be pivoted longer later so they can be analyzed; they are separated out at this stage because pivoting longer duplicates the financial information, which could lead to inaccurate analysis of financial impacts. The same issue applies to the deaths caused by the hurricanes. Since there is no information about how deaths or financial costs were spread across the affected areas, the most responsible approach seems to be to separate out that data prior to pivoting the areas affected longer.

data_damage <- data3 |> 
  select(!c(Areas.affected, REf))

head(data_damage)
## # A tibble: 6 × 14
##   Name  start_date end_date   duration_days date_footnote wind_speed_mph
##   <chr> <date>     <date>             <dbl> <chr>         <chr>         
## 1 One   1921-06-16 1921-06-26            10 ""            90            
## 2 Two   1921-09-04 1921-09-08             4 ""            80            
## 3 Four  1921-09-08 1921-09-14             6 ""            90            
## 4 Three 1922-09-18 1922-09-24             6 ""            80            
## 5 Four  1923-09-10 1923-09-13             3 ""            80            
## 6 Six   1923-10-12 1923-10-17             5 ""            80            
## # ℹ 8 more variables: wind_speed_km_per_hr <chr>, pressure_hPa <chr>,
## #   pressure_inHg <chr>, Deaths <chr>, damage_notes <chr>,
## #   damage_amount_dollars <dbl>, damage_modifier <chr>, Category <int>

The following code separates out the names, dates, and REf column. The REf column contains multiple references to footnotes (whose meanings were not included with the dataset). In order to normalize the data storage, this code moves these references to their own data frame and then separates the data longer, which allows the footnotes to be individually referenced without unnecessarily duplicating a significant amount of data in the other data frames. This mirrors the earlier separation of the monetary damage and death information; separating out the data begins normalizing it in order to reduce duplication. The name and dates act as a key so the footnotes can be referenced from another data set.

data_ref <- data3 |> 
  select(Name:end_date,REf) |>
  rename(ref = REf, name = Name) |> 
  mutate(
    ref = str_remove_all(ref, "\\."),
    ref = str_replace_all(ref, "\\]\\[", "\\],\\[")
    ) |> 
  separate_longer_delim(ref,",") |> 
  mutate(
    ref = str_remove_all(ref, "\\["),
    ref = str_remove_all(ref, "\\]")
  )

head(data_ref)
## # A tibble: 6 × 4
##   name  start_date end_date   ref  
##   <chr> <date>     <date>     <chr>
## 1 One   1921-06-16 1921-06-26 2    
## 2 Two   1921-09-04 1921-09-08 2    
## 3 Four  1921-09-08 1921-09-14 2    
## 4 Three 1922-09-18 1922-09-24 2    
## 5 Four  1923-09-10 1923-09-13 2    
## 6 Six   1923-10-12 1923-10-17 2

The following code pivots the areas affected column longer and saves the data to its own dataset, separate from the financial and death data. This is necessary because each cell of the column contains multiple inconsistently named and formatted areas.

This data is now tidy, but undergoes some additional cleaning further down.

data3 <- data3 |> 
  select(!Deaths:REf) |> 
  separate_longer_delim(
    # the first alternative treats run-together names (e.g. "…StatesFlorida")
    # as delimiters, which drops them; see the note after the output below
    Areas.affected, delim = regex("\\w+[a-z][A-Z]\\w+|, | and")
  ) |> 
  rename(area_affected = Areas.affected)

head(data3)
## # A tibble: 6 × 11
##   Name  start_date end_date   duration_days date_footnote wind_speed_mph
##   <chr> <date>     <date>             <dbl> <chr>         <chr>         
## 1 One   1921-06-16 1921-06-26            10 ""            90            
## 2 One   1921-06-16 1921-06-26            10 ""            90            
## 3 Two   1921-09-04 1921-09-08             4 ""            80            
## 4 Four  1921-09-08 1921-09-14             6 ""            90            
## 5 Three 1922-09-18 1922-09-24             6 ""            80            
## 6 Four  1923-09-10 1923-09-13             3 ""            80            
## # ℹ 5 more variables: wind_speed_km_per_hr <chr>, pressure_hPa <chr>,
## #   pressure_inHg <chr>, area_affected <chr>, Category <int>
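
One caveat about the delimiter above (an added note): because its first alternative matches the fused names themselves, a run-together pair such as “…United StatesFlorida” is removed rather than split apart. A zero-width lookaround would keep both parts; a sketch of that alternative:

# Splitting at a lowercase-to-uppercase boundary preserves both fused names
# (illustration only; the pipeline above keeps the original pattern):
str_split_1(
  "Gulf Coast of the United StatesFlorida",
  regex("(?<=[a-z])(?=[A-Z])|, | and")
)
# expected: "Gulf Coast of the United States" "Florida"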

The following code attempts to clean and transform the areas affected data so that it is consistent. In a true analysis it would be important to explore and define how the areas affected are structured and what each area actually means. The data, as is, is very inconsistent and uses different terms for what sound like the same region.

data3 <- data3 |> 
  mutate(
    area_affected = str_trim(area_affected),
    area_affected = str_replace_all(area_affected, "Caicos$", "Caicos Islands"),
    area_affected = str_replace_all(area_affected, "Atlantic Canada", "Canada, Atlantic"),
    area_affected = str_replace_all(area_affected, "Central Mexico", "Mexico, Central"),
    area_affected = str_replace_all(area_affected, "Canadian", "Canada,"),
    area_affected = str_replace_all(area_affected, "Cape Verde$", "Cape Verde Islands"),
    area_affected = str_replace_all(area_affected, "^Cayman$", "Cayman Islands"),
    area_affected = str_replace_all(area_affected, "Central United States|central United States", "United States, Central"),
    area_affected = str_replace_all(area_affected, "East Coast of the United States|Eastern Coast of the United States|Eastern United States|eastern United States|United States East Coast|United States East coast", "United States, Eastern"),
    area_affected = str_replace_all(area_affected, "Eastern Canada", "Canada, Eastern"),
    area_affected = str_replace_all(area_affected, "Greater Antilles", "Antilles, Greater"),
    area_affected = str_replace_all(area_affected, "Gulf Coast of the United States|United States Gulf Coast", "United States, Gulf Coast"),
    area_affected = str_replace_all(area_affected, "Gulf of Mexico", "Mexico, Gulf"),
    area_affected = str_replace_all(area_affected, "Leeward$", "Antilles, Leeward"),
    area_affected = str_replace_all(area_affected, "Great Britain", "United Kingdom"),
    area_affected = str_replace_all(area_affected, "Leeward Antilles|Leeward Islands|Lesser Antilles", "Antilles, Leeward"),
    # longest alternatives first, so "Mid-Atlantic States" is not left half-replaced
    area_affected = str_replace_all(area_affected, "Mid-Atlantic States|Mid-Atlantic states|Mid-Atlantic", "United States, Mid-Atlantic"),
    area_affected = str_replace_all(area_affected, "Midwestern Unites States", "United States, Midwestern"),
    area_affected = str_replace_all(area_affected, "Northeastern Caribbean", "Caribbean, Northeastern"),
    area_affected = str_replace_all(area_affected, "Northeastern United States", "United States, Northeastern"),
    area_affected = str_replace_all(area_affected, "South Florida", "Florida, South"),
    area_affected = str_replace_all(area_affected, "South Texas|Southern Texas", "Texas, South"),
    # repairs the compound produced by the "Central United States" rule above
    area_affected = str_replace_all(area_affected, "South United States, Central", "United States, South Central"),
    area_affected = str_replace_all(area_affected, "Southeast Mexico", "Mexico, Southeast"),
    area_affected = str_replace_all(area_affected, "Southeastern United States", "United States, Southeastern"),
    area_affected = str_replace_all(area_affected, "Southern Portugal", "Portugal, Southern"),
    area_affected = str_replace_all(area_affected, "Southwestern Florida", "Florida, Southwestern"),
    area_affected = str_replace_all(area_affected, "Southwestern Quebec", "Quebec, Southwestern"),
    area_affected = str_replace_all(area_affected, "The Bahamas", "Bahamas"),
    area_affected = str_replace_all(area_affected, "The Caribbean", "Caribbean"),
    area_affected = str_replace_all(area_affected, "The Carolinas", "Carolinas"),
    area_affected = str_replace_all(area_affected, "West Africa", "Africa, West"),
    area_affected = str_replace_all(area_affected, "Western Europe", "Europe, Western"),
    area_affected = str_replace_all(area_affected, "Western Mexico", "Mexico, Western"),
    area_affected = str_replace_all(area_affected, "^Yucatá.*", "Yucatán Peninsula"),
    area_affected = str_replace_all(area_affected, "western Cuba", "Cuba, Western")
  )

head(data3)
## # A tibble: 6 × 11
##   Name  start_date end_date   duration_days date_footnote wind_speed_mph
##   <chr> <date>     <date>             <dbl> <chr>         <chr>         
## 1 One   1921-06-16 1921-06-26            10 ""            90            
## 2 One   1921-06-16 1921-06-26            10 ""            90            
## 3 Two   1921-09-04 1921-09-08             4 ""            80            
## 4 Four  1921-09-08 1921-09-14             6 ""            90            
## 5 Three 1922-09-18 1922-09-24             6 ""            80            
## 6 Four  1923-09-10 1923-09-13             3 ""            80            
## # ℹ 5 more variables: wind_speed_km_per_hr <chr>, pressure_hPa <chr>,
## #   pressure_inHg <chr>, area_affected <chr>, Category <int>

The following code replaces “Unknown” deaths with NA in the data_damage data frame in order to provide more consistency during analysis and avoid non-numeric data within the column.

This dataset is now tidy as well.

data_damage <- data_damage |> 
  mutate(
    Deaths = na_if(Deaths, "Unknown")
  )

head(data_damage)
## # A tibble: 6 × 14
##   Name  start_date end_date   duration_days date_footnote wind_speed_mph
##   <chr> <date>     <date>             <dbl> <chr>         <chr>         
## 1 One   1921-06-16 1921-06-26            10 ""            90            
## 2 Two   1921-09-04 1921-09-08             4 ""            80            
## 3 Four  1921-09-08 1921-09-14             6 ""            90            
## 4 Three 1922-09-18 1922-09-24             6 ""            80            
## 5 Four  1923-09-10 1923-09-13             3 ""            80            
## 6 Six   1923-10-12 1923-10-17             5 ""            80            
## # ℹ 8 more variables: wind_speed_km_per_hr <chr>, pressure_hPa <chr>,
## #   pressure_inHg <chr>, Deaths <chr>, damage_notes <chr>,
## #   damage_amount_dollars <dbl>, damage_modifier <chr>, Category <int>

The following code block does some additional cleanup to both data frames. It extracts death notes, converts the measurement columns to numeric, renames columns, and eliminates a helper column.

data_damage <- data_damage |> 
  relocate(damage_notes, .after = damage_amount_dollars) |> 
  mutate(
    # keep the qualifiers (">" and "None") as notes before stripping them
    death_notes = if_else(str_detect(Deaths, ">"), "deaths are greater than", NA_character_),
    death_notes = if_else(str_detect(Deaths, "None"), Deaths, death_notes),
    Deaths = str_remove_all(Deaths, "\\W"),
    Deaths = str_remove_all(Deaths, "None"),
    Deaths = as.numeric(Deaths),
    wind_speed_mph = as.numeric(wind_speed_mph),
    wind_speed_km_per_hr = as.numeric(wind_speed_km_per_hr),
    pressure_hPa = as.numeric(pressure_hPa),
    pressure_inHg = as.numeric(pressure_inHg)
  ) |> 
  select(!damage_modifier) |>
  rename(deaths = Deaths, name = Name) |> 
  relocate(death_notes, .after = deaths)

data3 <- data3 |> 
  mutate(
    wind_speed_mph = as.numeric(wind_speed_mph),
    wind_speed_km_per_hr = as.numeric(wind_speed_km_per_hr),
    pressure_hPa = as.numeric(pressure_hPa),
    pressure_inHg = as.numeric(pressure_inHg)
  ) |> 
  rename(name = Name)

head(data3)
## # A tibble: 6 × 11
##   name  start_date end_date   duration_days date_footnote wind_speed_mph
##   <chr> <date>     <date>             <dbl> <chr>                  <dbl>
## 1 One   1921-06-16 1921-06-26            10 ""                        90
## 2 One   1921-06-16 1921-06-26            10 ""                        90
## 3 Two   1921-09-04 1921-09-08             4 ""                        80
## 4 Four  1921-09-08 1921-09-14             6 ""                        90
## 5 Three 1922-09-18 1922-09-24             6 ""                        80
## 6 Four  1923-09-10 1923-09-13             3 ""                        80
## # ℹ 5 more variables: wind_speed_km_per_hr <dbl>, pressure_hPa <dbl>,
## #   pressure_inHg <dbl>, area_affected <chr>, Category <int>
head(data_damage)
## # A tibble: 6 × 14
##   name  start_date end_date   duration_days date_footnote wind_speed_mph
##   <chr> <date>     <date>             <dbl> <chr>                  <dbl>
## 1 One   1921-06-16 1921-06-26            10 ""                        90
## 2 Two   1921-09-04 1921-09-08             4 ""                        80
## 3 Four  1921-09-08 1921-09-14             6 ""                        90
## 4 Three 1922-09-18 1922-09-24             6 ""                        80
## 5 Four  1923-09-10 1923-09-13             3 ""                        80
## 6 Six   1923-10-12 1923-10-17             5 ""                        80
## # ℹ 8 more variables: wind_speed_km_per_hr <dbl>, pressure_hPa <dbl>,
## #   pressure_inHg <dbl>, deaths <dbl>, death_notes <chr>,
## #   damage_amount_dollars <dbl>, damage_notes <chr>, Category <int>
head(data_ref)
## # A tibble: 6 × 4
##   name  start_date end_date   ref  
##   <chr> <date>     <date>     <chr>
## 1 One   1921-06-16 1921-06-26 2    
## 2 Two   1921-09-04 1921-09-08 2    
## 3 Four  1921-09-08 1921-09-14 2    
## 4 Three 1922-09-18 1922-09-24 2    
## 5 Four  1923-09-10 1923-09-13 2    
## 6 Six   1923-10-12 1923-10-17 2

All three data frames produced from the dataset are now tidy, clean, and ready for analysis.
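
Because name and the two dates were kept in every table, the normalized frames can be joined back together when needed. A minimal sketch (added as illustration):

# Re-attach the footnote references to the damage table via the shared key
data_damage |> 
  left_join(data_ref, by = c("name", "start_date", "end_date")) |> 
  select(name, start_date, end_date, damage_amount_dollars, ref) |> 
  head()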

Analysis

Below is a cursory analysis of the now tidy data.

The following code sorts the hurricanes by lives lost, keeps the ten deadliest (distinct() retains a single storm per repeated name such as “Four”), and charts them.

data_deaths <- data_damage |> 
  arrange(desc(deaths)) |>
  distinct(name,.keep_all = TRUE) |> 
  slice_head(n = 10)
ggplot(data_deaths, aes(x = reorder(name, -deaths),y = deaths)) + geom_col()

The following code plots the top ten hurricanes by estimated cost of damage, among the hurricanes that had damage data.

data_cost <- data_damage |> 
  arrange(desc(damage_amount_dollars)) |>
  distinct(name,.keep_all = TRUE) |> 
  slice_head(n = 10)
ggplot(data_cost, aes(x = reorder(name, -damage_amount_dollars), y = damage_amount_dollars)) + geom_col()

The following code displays then plots the areas with the most hurricanes in the dataset. (A tie at the cutoff means top_n(10) returns eleven areas here.)

data_areas_affected <- data3 |> 
  filter(area_affected != "None") |>
  count(area_affected, sort = TRUE) |>
  top_n(10) |> 
  print()
## Selecting by n
## # A tibble: 11 × 2
##    area_affected                 n
##    <chr>                     <int>
##  1 Bermuda                      57
##  2 Bahamas                      52
##  3 United States, Eastern       52
##  4 Florida                      51
##  5 Cuba                         45
##  6 Antilles, Leeward            39
##  7 Canada, Atlantic             37
##  8 Mexico                       37
##  9 Puerto Rico                  27
## 10 Newfoundland                 25
## 11 United States, Gulf Coast    25
ggplot(data_areas_affected, aes(x = reorder(area_affected, -n), y = n)) + geom_col()