library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data1 <- read.csv("https://raw.githubusercontent.com/mraynolds/data_607/refs/heads/main/data_607_project_2_dataset_1.csv")
head(data1)
## City Temp_Jan Temp_Feb Temp_Mar Humid_Jan Humid_Feb Humid_Mar
## 1 New York 32°F 35°F 42°F 75% 72% 68%
## 2 Los Angeles 58°F 60°F 65°F 65% 63% 60%
## 3 Chicago 28°F 30°F 40°F 80% 78% 75%
The dataset has one primary issue keeping it from being tidy: each column header combines two variables, a month and an environmental measurement (temperature or humidity).
The following code pivots the data longer while separating the column names into their component variables.
data1 <- data1 |>
pivot_longer(
cols = !City,
names_to = c("environment", "month"),
names_sep = "_",
values_to = "measurement"
)
head(data1)
## # A tibble: 6 × 4
## City environment month measurement
## <chr> <chr> <chr> <chr>
## 1 New York Temp Jan 32°F
## 2 New York Temp Feb 35°F
## 3 New York Temp Mar 42°F
## 4 New York Humid Jan 75%
## 5 New York Humid Feb 72%
## 6 New York Humid Mar 68%
The following code defines the month levels that will be used to make month an ordinal factor.
month_levels <- c(
  "Jan", "Feb", "Mar", "Apr", "May", "Jun",
  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)
The data could now use some cleaning. The code block below does the following:
- replaces the environment labels with descriptive, unit-bearing names (Humid becomes humidity_pct, Temp becomes temp_f)
- parses the measurement strings into numbers, dropping the °F and % symbols
- converts month to an ordinal factor using the levels above
- renames City to city
data_long_1 <- data1 |>
  mutate(
    environment = str_replace_all(environment, "Humid", "humidity_pct"),
    environment = str_replace_all(environment, "Temp", "temp_f"),
    # parse_number() drops the °F and % symbols and already returns a double
    measurement = parse_number(measurement),
    month = factor(month, levels = month_levels)
  ) |>
  rename(city = City)
head(data_long_1)
## # A tibble: 6 × 4
## city environment month measurement
## <chr> <chr> <fct> <dbl>
## 1 New York temp_f Jan 32
## 2 New York temp_f Feb 35
## 3 New York temp_f Mar 42
## 4 New York humidity_pct Jan 75
## 5 New York humidity_pct Feb 72
## 6 New York humidity_pct Mar 68
The data in the previous data frame, “data_long_1”, is still not tidy: the measurement column mixes two different variables, humidity and temperature.
The following code block pivots the data wider so that temperature and humidity each get their own column.
data_tidy_1 <- data_long_1 |>
pivot_wider(
names_from = environment,
values_from = measurement
)
head(data_tidy_1)
## # A tibble: 6 × 4
## city month temp_f humidity_pct
## <chr> <fct> <dbl> <dbl>
## 1 New York Jan 32 75
## 2 New York Feb 35 72
## 3 New York Mar 42 68
## 4 Los Angeles Jan 58 65
## 5 Los Angeles Feb 60 63
## 6 Los Angeles Mar 65 60
The data is now tidy and ready for analysis.
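As an aside, the two pivots above can be combined into a single step: tidyr's special ".value" sentinel tells pivot_longer() to spread the first component of each header back out into its own column. A minimal sketch, re-reading the raw file into a hypothetical data1_raw:
data1_raw <- read.csv("https://raw.githubusercontent.com/mraynolds/data_607/refs/heads/main/data_607_project_2_dataset_1.csv")
data1_raw |>
  pivot_longer(
    cols = !City,
    names_to = c(".value", "month"),
    names_sep = "_"
  )
# yields columns City, month, Temp, Humid in one pass; the renaming and
# parse_number() cleanup shown above would still apply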
The code block below calculates the average temperature and humidity for the dataset, first by month, and then by city.
data_tidy_1 |>
group_by(month) |>
summarise(
avg_temp_f = round(mean(temp_f),1),
avg_humidity_pct = round(mean(humidity_pct),1)
)
## # A tibble: 3 × 3
## month avg_temp_f avg_humidity_pct
## <fct> <dbl> <dbl>
## 1 Jan 39.3 73.3
## 2 Feb 41.7 71
## 3 Mar 49 67.7
data_tidy_1 |>
group_by(city) |>
summarise(
avg_temp_f = round(mean(temp_f),1),
avg_humidity_pct = round(mean(humidity_pct),1)
)
## # A tibble: 3 × 3
## city avg_temp_f avg_humidity_pct
## <chr> <dbl> <dbl>
## 1 Chicago 32.7 77.7
## 2 Los Angeles 61 62.7
## 3 New York 36.3 71.7
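The two summaries above differ only in the grouping column, so they could be written once with across() and an embraced argument. A sketch, where summarise_means is a hypothetical helper:
summarise_means <- function(df, group) {
  df |>
    group_by({{ group }}) |>
    summarise(across(c(temp_f, humidity_pct),
                     ~ round(mean(.x), 1),
                     .names = "avg_{.col}"))
}
summarise_means(data_tidy_1, month)
summarise_means(data_tidy_1, city)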
A plot of temperature over time.
ggplot(data_tidy_1, aes(x = month, y = temp_f, group = city, color = city)) +
geom_line()
A plot of humidity over time.
ggplot(data_tidy_1, aes(x = month, y = humidity_pct, group = city, color = city)) +
geom_line()
# Plot of temperature and humidity together
ggplot(data_tidy_1, aes(x = month, group = city, color = city)) +
  geom_line(aes(y = humidity_pct)) +
  geom_line(aes(y = temp_f)) +
  scale_y_continuous(name = "Percent Humidity",
                     sec.axis = sec_axis(~., name = "Temperature in Fahrenheit"))
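Since sec_axis(~.) is an identity transform, the plot above actually draws both series on a single scale; that works here because both measurements fall between 0 and 100, but a faceted view avoids the ambiguity. A sketch using the long-format data_long_1 built earlier:
ggplot(data_long_1, aes(x = month, y = measurement, group = city, color = city)) +
  geom_line() +
  facet_wrap(~environment, scales = "free_y")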
data2 <- read.csv("https://raw.githubusercontent.com/mraynolds/data_607/refs/heads/main/data_607_project_2_dataset_2.csv")
head(data2)
## Store.ID Store Country January February March April
## 1 1 Palisades US $371,700.00 $435,950.00 $372,460.00 $192,260.00
## 2 2 Billings US $97,530.00 $324,140.00 $454,480.00 $36,810.00
## 3 3 Laguardia US $346,130.00 $157,510.00 $288,990.00 $358,190.00
## 4 4 Cheeseburger US $442,010.00 $212,390.00 $183,580.00 $308,650.00
## 5 5 Detroit US $33,250.00 $36,840.00 $320,170.00 $242,650.00
## 6 6 Towns US $16,632.00 $25,372.00 $38,178.00 $13,222.00
## May June July August September October
## 1 $157,550.00 $332,550.00 $89,630.00 $372,090.00 $421,670.00 $173,010.00
## 2 $219,790.00 $210,970.00 $84,840.00 $175,440.00 $283,710.00 $275,320.00
## 3 $96,860.00 $461,950.00 $80,440.00 $404,990.00 $450,630.00 $327,270.00
## 4 $184,340.00 $156,540.00 $328,180.00 $281,430.00 $498,150.00 $473,150.00
## 5 $350,300.00 $421,980.00 $307,190.00 $16,900.00 $443,990.00 $346,230.00
## 6 $16,031.00 $37,162.00 $22,541.00 $16,700.00 $10,475.00 $1,897.00
## November
## 1 $173,220.00
## 2 $401,940.00
## 3 $370,100.00
## 4 $23,740.00
## 5 $312,670.00
## 6 $10,493.00
The dataset appears to be a list of store IDs, their locations, their countries, and a monetary amount listed by month in a wide format.
To start, the data will need to be pivoted longer in order to take the months out of the column headers and place them into rows.
The following code pivots the data longer into a column for the months and a column named “dollars”, since it is not clear what the monetary data represents and the only indicator is a dollar symbol.
A note on the money: in reality, this would require an investigation into the dataset and domain knowledge of what the money represents. Furthermore, all columns have dollar signs, but several countries are listed, including the US and Canada, which both use dollars, just not the same dollar. Several more of the countries use currencies other than the dollar, yet the values in their rows are listed as dollars. For the purposes of this project, we will assume the dataset is listed exclusively in US dollars, since the source data does not specify.
data2 <- data2 |>
pivot_longer(
cols = !(Store.ID:Country),
names_to = "month",
values_to = "dollars"
)
head(data2)
## # A tibble: 6 × 5
## Store.ID Store Country month dollars
## <int> <chr> <chr> <chr> <chr>
## 1 1 Palisades US January $371,700.00
## 2 1 Palisades US February $435,950.00
## 3 1 Palisades US March $372,460.00
## 4 1 Palisades US April $192,260.00
## 5 1 Palisades US May $157,550.00
## 6 1 Palisades US June $332,550.00
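As an aside, the month factor created below could instead be set during the pivot itself via names_transform. A sketch, assuming data2_raw holds the data as originally read:
data2_raw |>
  pivot_longer(
    cols = !(Store.ID:Country),
    names_to = "month",
    values_to = "dollars",
    # month.name is R's built-in vector of full English month names
    names_transform = list(month = ~ factor(.x, levels = month.name))
  )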
The following code defines ordered month levels, used below to make month an ordinal factor.
month_levels <- c(
"January", "February", "March", "April", "May", "June",
"July", "August", "September", "October", "November", "December"
)
The following code cleans up the data frame. It:
- renames the columns
- converts month to a factor and dollars to a double, removing the currency symbol and grouping commas
- converts store_id to a character, since it is an identifier rather than a quantity
data2 <- data2 |>
rename(
store_id = Store.ID,
store = Store,
country = Country
) |>
mutate(
month = factor(month, levels = month_levels),
dollars = parse_number(dollars),
store_id = as.character(store_id)
)
head(data2)
## # A tibble: 6 × 5
## store_id store country month dollars
## <chr> <chr> <chr> <fct> <dbl>
## 1 1 Palisades US January 371700
## 2 1 Palisades US February 435950
## 3 1 Palisades US March 372460
## 4 1 Palisades US April 192260
## 5 1 Palisades US May 157550
## 6 1 Palisades US June 332550
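The heavy lifting above is done by readr's parse_number(), which strips the currency symbol and grouping commas and parses what remains; for example:
parse_number("$371,700.00")
# returns 371700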
The data frame is now clean and tidy. The data is significantly longer as a result, but it is now in a form that can be analyzed easily and consistently.
The code below totals the monetary data by store, sorts it in descending order, and plots it. Store 3, Laguardia in the US, has the highest total while Chelsea in the UK has the lowest.
total_dollars_by_store <- data2 |>
group_by(store_id, store, country) |>
summarise(total_dollars = sum(dollars), .groups = "drop_last") |>
arrange(desc(total_dollars)) |>
print()
## # A tibble: 20 × 4
## # Groups: store_id, store [20]
## store_id store country total_dollars
## <chr> <chr> <chr> <dbl>
## 1 3 Laguardia US 3343060
## 2 4 Cheeseburger US 3092160
## 3 1 Palisades US 3092090
## 4 5 Detroit US 2832170
## 5 2 Billings US 2564970
## 6 20 Camembert France 1047658
## 7 8 Eh Canada 367044
## 8 9 Maple Canada 362619
## 9 19 Paolo Brazil 330761
## 10 14 Innit UK 315480
## 11 17 Steve China 308982
## 12 16 Fuji Japan 283811
## 13 10 Victoria Canada 264917
## 14 7 Tim Horton's Canada 258845
## 15 15 Hokkaido Japan 244173
## 16 13 Blimey UK 240649
## 17 18 Nanjing China 219850
## 18 6 Towns US 208703
## 19 12 Wimbledon UK 199778
## 20 11 Chelsea UK 191157
ggplot(total_dollars_by_store, aes(x = reorder(store_id, -total_dollars), y = total_dollars)) +
geom_col()
The following code adds up the monetary data by country and sorts it in descending order. It also calculates the average dollars per store by country. From this data it is clear that the US has the highest total dollars. This is to be expected, as it has the most stores. However, it also has the highest average dollars per store.
A notable insight from this data is that while France has only one store, it has the third highest total dollars and the second highest average dollars per store.
total_dollars_by_country <- data2 |>
  group_by(country) |>
  summarise(
    # each store contributes 11 rows (January through November),
    # so dividing the row count by 11 gives the number of stores
    store_per_country = n() / 11,
    total_dollars = sum(dollars),
    avg_dollars_per_store = total_dollars / store_per_country
  ) |>
  arrange(desc(total_dollars)) |>
  print()
## # A tibble: 7 × 4
## country store_per_country total_dollars avg_dollars_per_store
## <chr> <dbl> <dbl> <dbl>
## 1 US 6 15133153 2522192.
## 2 Canada 4 1253425 313356.
## 3 France 1 1047658 1047658
## 4 UK 4 947064 236766
## 5 China 2 528832 264416
## 6 Japan 2 527984 263992
## 7 Brazil 1 330761 330761
ggplot(total_dollars_by_country, aes(x = reorder(country, -total_dollars), y = total_dollars)) +
geom_col()
ggplot(total_dollars_by_country, aes(x = reorder(country, -avg_dollars_per_store), y = avg_dollars_per_store)) +
geom_col()
The following code totals the dollars by month and sorts the result in descending order. September has the highest dollar total while July has the lowest. Note that the source data contains no December column, so only eleven months appear.
dollars_by_month <- data2 |>
group_by(month) |>
summarise(
total_dollars = sum(dollars)
) |>
arrange(desc(total_dollars)) |>
print()
## # A tibble: 11 × 2
## month total_dollars
## <fct> <dbl>
## 1 September 2568518
## 2 November 2040795
## 3 March 1995192
## 4 June 1993274
## 5 October 1955800
## 6 February 1715421
## 7 January 1694355
## 8 August 1612337
## 9 April 1580057
## 10 May 1368979
## 11 July 1244149
ggplot(dollars_by_month, aes(x = month, y = total_dollars)) +
geom_col()
The following code loads the data set into R, treating empty cells as NA.
data3 <- read.csv("https://raw.githubusercontent.com/mraynolds/data_607/refs/heads/main/data_607_project_2_dataset_3.csv", na.strings = c("", "NA"))
head(data3)
## X Name Duration Wind.speed Pressure
## 1 0 One June 16 – 26, 1921 90 mph (150 km/h) 980 hPa (28.94 inHg)
## 2 1 Two September 4 – 8, 1921 80 mph (130 km/h) 985 hPa (29.09 inHg)
## 3 2 Four September 8 – 14, 1921 90 mph (150 km/h) 979 hPa (28.91 inHg)
## 4 3 Three September 18 – 24, 1922 80 mph (130 km/h) 987 hPa (29.15 inHg)
## 5 4 Four September 10 – 13, 1923 80 mph (130 km/h) 986 hPa (29.12 inHg)
## 6 5 Six October 12 – 17, 1923 80 mph (130 km/h) 983 hPa (29.03 inHg)
## Areas.affected Deaths Damage REf Category
## 1 Central America, Gulf of Mexico 0 <NA> [2] 1
## 2 Mexico 215 $19 million [2] 1
## 3 None 0 <NA> [2] 1
## 4 None 0 <NA> [2] 1
## 5 Newfoundland 0 <NA> [2] 1
## 6 Mexico, Gulf Coast of the United States 0 <NA> [2] 1
The dataset has several issues that keep it from being tidy, and even more that keep it from being clean.
To tidy it, the following transformations will be executed:
- separate Duration into start and end dates and create an actual duration column
- separate the environmental metrics (wind speed and pressure) so that each column contains a single numeric variable
- separate and convert the monetary damage into columns for notes and numeric values, eliminating the text-based values
- pivot the areas affected longer so they can be evaluated individually
Most columns also need some additional cleaning.
The code block below begins separating out the date data. The duration values are mostly a range of two dates with very inconsistent formatting. The code separates the original Duration column into individual components for the start month, day, and year, the end month, day, and year, and a footnote column. (The original dataset did not contain the actual footnote text.)
data3 <- data3 |>
  select(!X) |>
  separate_wider_regex(
    Duration,
    patterns = c(
      start_month = "[A-Za-z]+",   # e.g. "June"
      "\\s*",
      start_day = "[0-9]+",        # e.g. "16"
      ",*",
      "\\s*",
      start_year = "[0-9]*",       # present only when the range spans years
      "\\W*",                      # consumes the "–" separator
      end_month = "[A-Za-z]*",     # empty when the range stays in one month
      "\\W*",
      end_day = "[0-9]*",
      "\\W*",
      end_year = "[0-9]*",
      "\\s*",
      date_footnote = ".*"         # whatever trails the dates
    )
  )
head(data3)
## # A tibble: 6 × 15
## Name start_month start_day start_year end_month end_day end_year
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 One June 16 "" "" 26 1921
## 2 Two September 4 "" "" 8 1921
## 3 Four September 8 "" "" 14 1921
## 4 Three September 18 "" "" 24 1922
## 5 Four September 10 "" "" 13 1923
## 6 Six October 12 "" "" 17 1923
## # ℹ 8 more variables: date_footnote <chr>, Wind.speed <chr>, Pressure <chr>,
## # Areas.affected <chr>, Deaths <chr>, Damage <chr>, REf <chr>, Category <int>
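To see how the patterns consume a value, it can help to run the same call on toy strings (hypothetical examples: one range within a single month, one spanning two):
tibble(Duration = c("June 16 – 26, 1921",
                    "September 28 – October 3, 1929")) |>
  separate_wider_regex(
    Duration,
    patterns = c(
      start_month = "[A-Za-z]+", "\\s*",
      start_day = "[0-9]+", ",*", "\\s*",
      start_year = "[0-9]*", "\\W*",
      end_month = "[A-Za-z]*", "\\W*",
      end_day = "[0-9]*", "\\W*",
      end_year = "[0-9]*", "\\s*",
      date_footnote = ".*"
    )
  )
# row 1 leaves start_year and end_month empty (filled in below);
# row 2 captures both months directly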
The following code fills gaps in the date components: it copies the end year into the start year wherever the start year is missing, and fills any missing end month, end day, or end year from the corresponding start values.
data3 <- data3 |>
  mutate(
    start_year = if_else(start_year == "", end_year, start_year),
    end_month = if_else(end_month == "", start_month, end_month),
    end_day = if_else(end_day == "", start_day, end_day),
    end_year = if_else(end_year == "", start_year, end_year)
  )
head(data3)
## # A tibble: 6 × 15
## Name start_month start_day start_year end_month end_day end_year
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 One June 16 1921 June 26 1921
## 2 Two September 4 1921 September 8 1921
## 3 Four September 8 1921 September 14 1921
## 4 Three September 18 1922 September 24 1922
## 5 Four September 10 1923 September 13 1923
## 6 Six October 12 1923 October 17 1923
## # ℹ 8 more variables: date_footnote <chr>, Wind.speed <chr>, Pressure <chr>,
## # Areas.affected <chr>, Deaths <chr>, Damage <chr>, REf <chr>, Category <int>
The following code block converts the month names to their numeric equivalents, builds full start and end dates, and computes the difference between them to create a duration column.
The duration column is a nice addition to the data, as it summarizes each date range as a single, easily read number of days.
data3 <- data3 |>
  mutate(
    # match() against month.name converts "June" to 6, and so on
    start_month = match(start_month, month.name),
    start_date = make_date(start_year, start_month, start_day),
    end_month = match(end_month, month.name),
    end_date = make_date(end_year, end_month, end_day),
    # subtracting two Dates yields a difference in days
    duration_days = as.numeric(end_date - start_date)
  ) |>
  select(!start_month:end_year) |>
  relocate(start_date:duration_days, .after = Name)
head(data3)
## # A tibble: 6 × 12
## Name start_date end_date duration_days date_footnote Wind.speed Pressure
## <chr> <date> <date> <dbl> <chr> <chr> <chr>
## 1 One 1921-06-16 1921-06-26 10 "" 90 mph (150 … 980 hPa…
## 2 Two 1921-09-04 1921-09-08 4 "" 80 mph (130 … 985 hPa…
## 3 Four 1921-09-08 1921-09-14 6 "" 90 mph (150 … 979 hPa…
## 4 Three 1922-09-18 1922-09-24 6 "" 80 mph (130 … 987 hPa…
## 5 Four 1923-09-10 1923-09-13 3 "" 80 mph (130 … 986 hPa…
## 6 Six 1923-10-12 1923-10-17 5 "" 80 mph (130 … 983 hPa…
## # ℹ 5 more variables: Areas.affected <chr>, Deaths <chr>, Damage <chr>,
## # REf <chr>, Category <int>
The following code separates the wind speed and pressure columns so that each resulting column contains a single numeric value in a single unit.
data3 <- data3 |>
  separate_wider_regex(
    Wind.speed,
    patterns = c(
      wind_speed_mph = "[0-9]+",        # leading mph value
      "\\D+",                           # " mph ("
      wind_speed_km_per_hr = "[0-9]+",  # km/h value
      "\\D+"                            # " km/h)"
    )
  ) |>
  mutate(Pressure = str_remove_all(Pressure, ",")) |>
  separate_wider_regex(
    Pressure,
    patterns = c(
      pressure_hPa = "[0-9]+",          # hPa value
      "\\D+",                           # " hPa ("
      pressure_inHg = "[0-9.]+",        # inHg value, possibly with a decimal
      "\\D+"                            # " inHg)"
    ),
    too_few = "align_start"
  )
head(data3)
## # A tibble: 6 × 14
## Name start_date end_date duration_days date_footnote wind_speed_mph
## <chr> <date> <date> <dbl> <chr> <chr>
## 1 One 1921-06-16 1921-06-26 10 "" 90
## 2 Two 1921-09-04 1921-09-08 4 "" 80
## 3 Four 1921-09-08 1921-09-14 6 "" 90
## 4 Three 1922-09-18 1922-09-24 6 "" 80
## 5 Four 1923-09-10 1923-09-13 3 "" 80
## 6 Six 1923-10-12 1923-10-17 5 "" 80
## # ℹ 8 more variables: wind_speed_km_per_hr <chr>, pressure_hPa <chr>,
## # pressure_inHg <chr>, Areas.affected <chr>, Deaths <chr>, Damage <chr>,
## # REf <chr>, Category <int>
The following code block cleans up and converts the damage information into numeric values. As loaded, the formatting is very inconsistent; this code makes it consistent so that it can be used in analysis.
data3 <- data3 |>
  separate_wider_regex(
    Damage,
    patterns = c(
      damage_notes = "^\\[.*|>*",  # a leading footnote, or a ">" qualifier
      "\\$?",
      damage_amount_dollars = "\\$?\\s*\\d*\\.*\\d*",  # the numeric amount
      "\\s*",
      damage_modifier = "[A-Za-z]*"  # "thousand", "million", "billion", or "Unknown"
    ),
    too_few = "align_start"
  )
data3 <- data3 |> mutate(
damage_amount_dollars = str_remove_all(damage_amount_dollars, "\\$"),
damage_amount_dollars = as.numeric(damage_amount_dollars),
damage_amount_dollars = if_else(damage_modifier == "thousand", damage_amount_dollars*1000, damage_amount_dollars),
damage_amount_dollars = if_else(damage_modifier == "million", damage_amount_dollars*1000000, damage_amount_dollars),
damage_amount_dollars = if_else(damage_modifier == "billion", damage_amount_dollars*1000000000, damage_amount_dollars),
damage_notes = if_else(damage_modifier == "Unknown", "Unknown", damage_notes),
damage_notes = if_else(damage_notes == ">", "Damage greater than damage_amount", damage_notes)
)
head(data3)
## # A tibble: 6 × 16
## Name start_date end_date duration_days date_footnote wind_speed_mph
## <chr> <date> <date> <dbl> <chr> <chr>
## 1 One 1921-06-16 1921-06-26 10 "" 90
## 2 Two 1921-09-04 1921-09-08 4 "" 80
## 3 Four 1921-09-08 1921-09-14 6 "" 90
## 4 Three 1922-09-18 1922-09-24 6 "" 80
## 5 Four 1923-09-10 1923-09-13 3 "" 80
## 6 Six 1923-10-12 1923-10-17 5 "" 80
## # ℹ 10 more variables: wind_speed_km_per_hr <chr>, pressure_hPa <chr>,
## # pressure_inHg <chr>, Areas.affected <chr>, Deaths <chr>,
## # damage_notes <chr>, damage_amount_dollars <dbl>, damage_modifier <chr>,
## # REf <chr>, Category <int>
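The chained if_else() calls above could be collapsed into a single case_when() lookup. A minimal sketch of the same multiplier logic, where multiplier is a hypothetical helper:
multiplier <- function(modifier) {
  case_when(
    modifier == "thousand" ~ 1e3,
    modifier == "million"  ~ 1e6,
    modifier == "billion"  ~ 1e9,
    .default = 1
  )
}
# usage: damage_amount_dollars * multiplier(damage_modifier)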
The code below saves a tidy version of the data set with the columns “Areas.affected” and “REf” removed. The result can be analyzed for all metrics except areas affected and references. The areas affected will be pivoted longer later so they can be analyzed individually. They are separated at this stage because pivoting longer duplicates the financial figures, which could lead to inaccurate analysis of financial impacts; the same issue applies to deaths. Since no information describes how deaths or financial costs were spread across the affected areas, the most responsible approach seems to be to separate that data out before pivoting the areas affected longer.
data_damage <- data3 |>
  select(!c(Areas.affected, REf))
head(data_damage)
## # A tibble: 6 × 14
## Name start_date end_date duration_days date_footnote wind_speed_mph
## <chr> <date> <date> <dbl> <chr> <chr>
## 1 One 1921-06-16 1921-06-26 10 "" 90
## 2 Two 1921-09-04 1921-09-08 4 "" 80
## 3 Four 1921-09-08 1921-09-14 6 "" 90
## 4 Three 1922-09-18 1922-09-24 6 "" 80
## 5 Four 1923-09-10 1923-09-13 3 "" 80
## 6 Six 1923-10-12 1923-10-17 5 "" 80
## # ℹ 8 more variables: wind_speed_km_per_hr <chr>, pressure_hPa <chr>,
## # pressure_inHg <chr>, Deaths <chr>, damage_notes <chr>,
## # damage_amount_dollars <dbl>, damage_modifier <chr>, Category <int>
The following code separates out the names, dates, and REf column. The REf column contains multiple references to footnotes (whose meanings were not included with the dataset). To normalize the data storage, this code moves these references to their own data frame and then pivots them longer, which allows the footnotes to be individually referenced without unnecessarily duplicating a significant amount of data in the other data frames. This parallels the need to separate out the monetary damage and death information; in effect, separating out the data begins normalizing it in order to reduce duplication. The name and dates act as a key so the footnotes can be joined back to the other data sets.
data_ref <- data3 |>
  select(Name:end_date, REf) |>
  rename(ref = REf, name = Name) |>
  mutate(
    ref = str_remove_all(ref, "\\."),                # drop stray periods
    ref = str_replace_all(ref, "\\]\\[", "\\],\\[")  # insert a comma between adjacent footnotes
  ) |>
  separate_longer_delim(ref, ",") |>                 # one row per footnote
  mutate(
    ref = str_remove_all(ref, "\\["),
    ref = str_remove_all(ref, "\\]")
  )
head(data_ref)
## # A tibble: 6 × 4
## name start_date end_date ref
## <chr> <date> <date> <chr>
## 1 One 1921-06-16 1921-06-26 2
## 2 Two 1921-09-04 1921-09-08 2
## 3 Four 1921-09-08 1921-09-14 2
## 4 Three 1922-09-18 1922-09-24 2
## 5 Four 1923-09-10 1923-09-13 2
## 6 Six 1923-10-12 1923-10-17 2
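An alternative to the remove/replace/split sequence above is to extract the bracketed numbers directly and unnest them; a sketch producing the same shape:
data3 |>
  select(Name:end_date, REf) |>
  # each REf value becomes a list of the numbers found inside brackets
  mutate(ref = str_extract_all(REf, "(?<=\\[)[^\\]]+(?=\\])")) |>
  select(!REf) |>
  unnest_longer(ref) |>
  rename(name = Name)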
The following code pivots the areas affected column longer and saves the result to its own data set, separate from the financial and death data. This is necessary because each cell of the column lists multiple, inconsistently named and formatted areas.
This data is now tidy, but undergoes some additional cleaning further down.
data3 <- data3 |>
  select(!(Deaths:REf)) |>
  separate_longer_delim(
    Areas.affected,
    # delimiters: run-together names (the first alternative consumes text
    # glued as "...aB..."), comma-space, and " and"
    delim = regex("\\w+[a-z][A-Z]\\w+|, | and")
  ) |>
  rename(area_affected = Areas.affected)
head(data3)
## # A tibble: 6 × 11
## Name start_date end_date duration_days date_footnote wind_speed_mph
## <chr> <date> <date> <dbl> <chr> <chr>
## 1 One 1921-06-16 1921-06-26 10 "" 90
## 2 One 1921-06-16 1921-06-26 10 "" 90
## 3 Two 1921-09-04 1921-09-08 4 "" 80
## 4 Four 1921-09-08 1921-09-14 6 "" 90
## 5 Three 1922-09-18 1922-09-24 6 "" 80
## 6 Four 1923-09-10 1923-09-13 3 "" 80
## # ℹ 5 more variables: wind_speed_km_per_hr <chr>, pressure_hPa <chr>,
## # pressure_inHg <chr>, area_affected <chr>, Category <int>
The following code attempts to clean and transform the areas affected data so that it is consistent. In a true analysis it would be important to explore how the affected areas are structured and what each area actually means. As is, the data is very inconsistent and uses different terms for what sound like the same region.
data3 <- data3 |>
  mutate(
    # trim a stray leading/trailing space left over from the split
    area_affected = str_remove_all(area_affected, "^\\s|\\s$"),
    area_affected = str_replace_all(area_affected, "Caicos$", "Caicos Islands"),
    area_affected = str_replace_all(area_affected, "Atlantic Canada", "Canada, Atlantic"),
    area_affected = str_replace_all(area_affected, "Central Mexico", "Mexico, Central"),
    area_affected = str_replace_all(area_affected, "Canadian", "Canada,"),
    area_affected = str_replace_all(area_affected, "Cape Verde$", "Cape Verde Islands"),
    area_affected = str_replace_all(area_affected, "Cayman $", "Cayman Islands"),
    area_affected = str_replace_all(area_affected, "Central United States|central United States", "United States, Central"),
    # the specific Northeastern/Southeastern patterns must run before the
    # generic Eastern patterns, which would otherwise match inside them
    area_affected = str_replace_all(area_affected, "Northeastern United States", "United States, Northeastern"),
    area_affected = str_replace_all(area_affected, "Southeastern United States", "United States, Southeastern"),
    area_affected = str_replace_all(area_affected, "East Coast of the United States|Eastern Coast of the United States|Eastern United States|eastern United States|United States East Coast|United States East coast", "United States, Eastern"),
    area_affected = str_replace_all(area_affected, "Eastern Canada", "Canada, Eastern"),
    area_affected = str_replace_all(area_affected, "Greater Antilles", "Antilles, Greater"),
    area_affected = str_replace_all(area_affected, "Gulf Coast of the United States|United States Gulf Coast", "United States, Gulf Coast"),
    area_affected = str_replace_all(area_affected, "Gulf of Mexico", "Mexico, Gulf"),
    area_affected = str_replace_all(area_affected, "Leeward$", "Antilles, Leeward"),
    area_affected = str_replace_all(area_affected, "Great Britain", "United Kingdom"),
    area_affected = str_replace_all(area_affected, "Leeward Antilles|Leeward Islands|Lesser Antilles", "Antilles, Leeward"),
    # longest alternatives first, so "Mid-Atlantic" alone cannot match
    # inside "Mid-Atlantic States"
    area_affected = str_replace_all(area_affected, "Mid-Atlantic States|Mid-Atlantic states|Mid-Atlantic", "United States, Mid-Atlantic"),
    # cover both the misspelled form found in the source and the correct one
    area_affected = str_replace_all(area_affected, "Midwestern Unites States|Midwestern United States", "United States, Midwestern"),
    area_affected = str_replace_all(area_affected, "Northeastern Caribbean", "Caribbean, Northeastern"),
    area_affected = str_replace_all(area_affected, "South Florida", "Florida, South"),
    area_affected = str_replace_all(area_affected, "South Texas|Southern Texas", "Texas, South"),
    area_affected = str_replace_all(area_affected, "South United States, Central", "United States, South Central"),
    area_affected = str_replace_all(area_affected, "Southeast Mexico", "Mexico, Southeast"),
    area_affected = str_replace_all(area_affected, "Southern Portugal", "Portugal, Southern"),
    area_affected = str_replace_all(area_affected, "Southwestern Florida", "Florida, Southwestern"),
    area_affected = str_replace_all(area_affected, "Southwestern Quebec", "Quebec, Southwestern"),
    area_affected = str_replace_all(area_affected, "The Bahamas", "Bahamas"),
    area_affected = str_replace_all(area_affected, "The Caribbean", "Caribbean"),
    area_affected = str_replace_all(area_affected, "The Carolinas", "Carolinas"),
    area_affected = str_replace_all(area_affected, "West Africa", "Africa, West"),
    area_affected = str_replace_all(area_affected, "Western Europe", "Europe, Western"),
    area_affected = str_replace_all(area_affected, "Western Mexico", "Mexico, Western"),
    area_affected = str_replace_all(area_affected, "^Yucatá.*", "Yucatán Peninsula"),
    area_affected = str_replace_all(area_affected, "western Cuba", "Cuba, Western")
  )
head(data3)
## # A tibble: 6 × 11
## Name start_date end_date duration_days date_footnote wind_speed_mph
## <chr> <date> <date> <dbl> <chr> <chr>
## 1 One 1921-06-16 1921-06-26 10 "" 90
## 2 One 1921-06-16 1921-06-26 10 "" 90
## 3 Two 1921-09-04 1921-09-08 4 "" 80
## 4 Four 1921-09-08 1921-09-14 6 "" 90
## 5 Three 1922-09-18 1922-09-24 6 "" 80
## 6 Four 1923-09-10 1923-09-13 3 "" 80
## # ℹ 5 more variables: wind_speed_km_per_hr <chr>, pressure_hPa <chr>,
## # pressure_inHg <chr>, area_affected <chr>, Category <int>
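A chain of this length is easier to maintain as a named lookup vector, since str_replace_all() accepts pattern = replacement pairs. A partial sketch (area_fixes is hypothetical and shows only a few of the pairs above):
area_fixes <- c(
  "Atlantic Canada" = "Canada, Atlantic",
  "Gulf of Mexico"  = "Mexico, Gulf",
  "The Bahamas"     = "Bahamas"
)
str_replace_all("Gulf of Mexico, The Bahamas", area_fixes)
# "Mexico, Gulf, Bahamas"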
The following code replaces “Unknown” deaths with NA in the data_damage data frame, so the column contains only numeric values and is consistent for analysis.
This data set is now tidy as well.
data_damage <- data_damage |>
mutate(
Deaths = na_if(Deaths, "Unknown")
)
head(data_damage)
## # A tibble: 6 × 14
## Name start_date end_date duration_days date_footnote wind_speed_mph
## <chr> <date> <date> <dbl> <chr> <chr>
## 1 One 1921-06-16 1921-06-26 10 "" 90
## 2 Two 1921-09-04 1921-09-08 4 "" 80
## 3 Four 1921-09-08 1921-09-14 6 "" 90
## 4 Three 1922-09-18 1922-09-24 6 "" 80
## 5 Four 1923-09-10 1923-09-13 3 "" 80
## 6 Six 1923-10-12 1923-10-17 5 "" 80
## # ℹ 8 more variables: wind_speed_km_per_hr <chr>, pressure_hPa <chr>,
## # pressure_inHg <chr>, Deaths <chr>, damage_notes <chr>,
## # damage_amount_dollars <dbl>, damage_modifier <chr>, Category <int>
The following code block does some additional cleanup on both data frames. It extracts death notes, converts the measurement columns to numeric, renames columns, and eliminates the damage_modifier helper column.
data_damage <- data_damage |>
  relocate(damage_notes, .after = damage_amount_dollars) |>
  mutate(
    # capture qualifiers (">", "None") as notes before stripping them
    death_notes = if_else(str_detect(Deaths, ">"), "deaths are greater than", NA_character_),
    death_notes = if_else(str_detect(Deaths, "None"), Deaths, death_notes),
    Deaths = str_remove_all(Deaths, "[\\W]*"),
    Deaths = str_remove_all(Deaths, "None"),
    Deaths = as.numeric(Deaths),
    wind_speed_mph = as.numeric(wind_speed_mph),
    wind_speed_km_per_hr = as.numeric(wind_speed_km_per_hr),
    pressure_hPa = as.numeric(pressure_hPa),
    pressure_inHg = as.numeric(pressure_inHg)
  ) |>
  select(!damage_modifier) |>
  rename(deaths = Deaths, name = Name) |>
  relocate(death_notes, .after = deaths)
data3 <- data3 |>
mutate(
wind_speed_mph = as.numeric(wind_speed_mph),
wind_speed_km_per_hr = as.numeric(wind_speed_km_per_hr),
pressure_hPa = as.numeric(pressure_hPa),
pressure_inHg = as.numeric(pressure_inHg)
) |>
rename(name = Name)
head(data3)
## # A tibble: 6 × 11
## name start_date end_date duration_days date_footnote wind_speed_mph
## <chr> <date> <date> <dbl> <chr> <dbl>
## 1 One 1921-06-16 1921-06-26 10 "" 90
## 2 One 1921-06-16 1921-06-26 10 "" 90
## 3 Two 1921-09-04 1921-09-08 4 "" 80
## 4 Four 1921-09-08 1921-09-14 6 "" 90
## 5 Three 1922-09-18 1922-09-24 6 "" 80
## 6 Four 1923-09-10 1923-09-13 3 "" 80
## # ℹ 5 more variables: wind_speed_km_per_hr <dbl>, pressure_hPa <dbl>,
## # pressure_inHg <dbl>, area_affected <chr>, Category <int>
head(data_damage)
## # A tibble: 6 × 14
## name start_date end_date duration_days date_footnote wind_speed_mph
## <chr> <date> <date> <dbl> <chr> <dbl>
## 1 One 1921-06-16 1921-06-26 10 "" 90
## 2 Two 1921-09-04 1921-09-08 4 "" 80
## 3 Four 1921-09-08 1921-09-14 6 "" 90
## 4 Three 1922-09-18 1922-09-24 6 "" 80
## 5 Four 1923-09-10 1923-09-13 3 "" 80
## 6 Six 1923-10-12 1923-10-17 5 "" 80
## # ℹ 8 more variables: wind_speed_km_per_hr <dbl>, pressure_hPa <dbl>,
## # pressure_inHg <dbl>, deaths <dbl>, death_notes <chr>,
## # damage_amount_dollars <dbl>, damage_notes <chr>, Category <int>
head(data_ref)
## # A tibble: 6 × 4
## name start_date end_date ref
## <chr> <date> <date> <chr>
## 1 One 1921-06-16 1921-06-26 2
## 2 Two 1921-09-04 1921-09-08 2
## 3 Four 1921-09-08 1921-09-14 2
## 4 Three 1922-09-18 1922-09-24 2
## 5 Four 1923-09-10 1923-09-13 2
## 6 Six 1923-10-12 1923-10-17 2
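As an aside, the four repeated as.numeric() conversions above could be written once with across(); a sketch:
measure_cols <- c("wind_speed_mph", "wind_speed_km_per_hr",
                  "pressure_hPa", "pressure_inHg")
data3 |>
  mutate(across(all_of(measure_cols), as.numeric))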
All three data frames produced from the data set are now tidy, clean, and ready for analysis.
Below is a cursory analysis of the tidy data.
The following code finds and then charts the ten hurricanes that cost the most human life.
data_deaths <- data_damage |>
  arrange(desc(deaths)) |>
  # names repeat across years; this keeps the deadliest storm for each name
  distinct(name, .keep_all = TRUE) |>
  slice_head(n = 10)
ggplot(data_deaths, aes(x = reorder(name, -deaths),y = deaths)) + geom_col()
The following code finds and then plots the top ten hurricanes by estimated damage cost, among the hurricanes that have damage data.
data_cost <- data_damage |>
arrange(desc(damage_amount_dollars)) |>
distinct(name,.keep_all = TRUE) |>
slice_head(n = 10)
ggplot(data_cost, aes(x = reorder(name, -damage_amount_dollars), y = damage_amount_dollars)) + geom_col()
The following code counts and then plots the top ten areas with the most hurricanes in the dataset. (top_n() keeps ties, which is why eleven areas appear below.)
data_areas_affected <- data3 |>
filter(area_affected != "None") |>
count(area_affected, sort = TRUE) |>
top_n(10) |>
print()
## Selecting by n
## # A tibble: 11 × 2
## area_affected n
## <chr> <int>
## 1 Bermuda 57
## 2 Bahamas 52
## 3 United States, Eastern 52
## 4 Florida 51
## 5 Cuba 45
## 6 Antilles, Leeward 39
## 7 Canada, Atlantic 37
## 8 Mexico 37
## 9 Puerto Rico 27
## 10 Newfoundland 25
## 11 United States, Gulf Coast 25
ggplot(data_areas_affected, aes(x = reorder(area_affected, -n), y = n)) + geom_col()
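Note that top_n() is superseded in current dplyr; slice_max() is the modern equivalent and makes the tie behavior explicit. A sketch producing the same table:
data3 |>
  filter(area_affected != "None") |>
  count(area_affected, sort = TRUE) |>
  slice_max(n, n = 10)  # with_ties = TRUE by default, hence the eleven rows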