#Libraries and Load Data

library(readr)
## Warning: package 'readr' was built under R version 4.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
# Read the earthquake records from an explicit path. Calling setwd() is avoided
# because it changes the working directory for the rest of the session.
earthquakes <- read_csv("~/Desktop/datasets/earthquakes.csv")
## Rows: 123 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): month, area, region
## dbl (4): year, day, richter, deaths
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#Intro and question

Question: Which continent has the most deadly earthquakes?

The dataset “Earthquakes” is a select set of notable earthquakes from the years 1900-1999. It is a data frame with 123 rows and 7 variables, and comes from the World Almanac and Book of Facts: 2011. The variables describe different elements of each earthquake and are defined below:

“year” - discrete, quantitative variable that contains what year the earthquake took place

“month” - ordinal, qualitative variable that contains the name of the month in which the earthquake took place

“day” - discrete, quantitative variable that contains what day the earthquake took place

“richter” - continuous, quantitative variable that contains the severity of the earthquake according to the richter scale

“area” - a nominal, qualitative variable that contains the city or local area where the earthquake took place

“region” - a nominal, qualitative variable that contains the country (or pair of bordering countries) where the earthquake took place

“deaths” - discrete, quantitative variable that contains how many deaths the earthquake caused

#Overview

To answer the question of which continent has the most deadly earthquakes, I will focus on the deaths and region variables. I will perform an Exploratory Data Analysis by checking the structure/head/tail and checking (and handling) NA values. Then I will use the region variable to create a new variable named “continent”, and create a new dataframe named “c_earthquakes” that contains the total deaths per continent. Finally, I will construct a bar chart displaying c_earthquakes and summarize my findings.
bar chart

#Read and Clean Data

# Show the dimensions of the data plus the type and first values of each column.
str(earthquakes)
## spc_tbl_ [123 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ year   : num [1:123] 1902 1902 1903 1903 1905 ...
##  $ month  : chr [1:123] "April" "December" "April" "May" ...
##  $ day    : num [1:123] 19 16 28 28 4 31 16 18 17 21 ...
##  $ richter: num [1:123] 7.5 6.4 7 5.8 7.5 8.8 6.8 7.7 8.6 8.1 ...
##  $ area   : chr [1:123] "Quezaltenango and San Marco" "Uzbekistan" "Malazgirt" "Gole" ...
##  $ region : chr [1:123] "Guatemala" "Russia" "Turkey" "Turkey" ...
##  $ deaths : num [1:123] 2000 4700 3500 1000 19000 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   year = col_double(),
##   ..   month = col_character(),
##   ..   day = col_double(),
##   ..   richter = col_double(),
##   ..   area = col_character(),
##   ..   region = col_character(),
##   ..   deaths = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# The dataset has 123 rows and 7 columns. year, day, richter, and deaths are numeric; month, area, and region are character.

# Preview the first six rows to confirm the file was parsed as expected.
head(earthquakes)
## # A tibble: 6 × 7
##    year month      day richter area                        region    deaths
##   <dbl> <chr>    <dbl>   <dbl> <chr>                       <chr>      <dbl>
## 1  1902 April       19     7.5 Quezaltenango and San Marco Guatemala   2000
## 2  1902 December    16     6.4 Uzbekistan                  Russia      4700
## 3  1903 April       28     7   Malazgirt                   Turkey      3500
## 4  1903 May         28     5.8 Gole                        Turkey      1000
## 5  1905 April        4     7.5 Kangra                      India      19000
## 6  1906 January     31     8.8 Esmeraldas (off coast)      Ecuador     1000
# Preview the last six rows to check the end of the file for stray or partial records.
tail(earthquakes)
## # A tibble: 6 × 7
##    year month       day richter area       region                 deaths
##   <dbl> <chr>     <dbl>   <dbl> <chr>      <chr>                   <dbl>
## 1  1998 February      4     5.9 Hindu Kush Afghanistan              2323
## 2  1998 May          30     6.6 Border     Afghanistan-Tajikistan   4000
## 3  1998 July         17     7   Papua      New Guinea               2183
## 4  1999 January      25     6.1 Armenia    Colombia                 1185
## 5  1999 August       17     7.6 Izmit      Turkey                  17118
## 6  1999 September    20     7.6 Taichung   Taiwan                   2400
# The top and bottom of the data look normal. The rows are in chronological order by year.

# Count the missing values in the two columns the analysis depends on.
sum(is.na(earthquakes$deaths))
## [1] 2
sum(is.na(earthquakes$region))
## [1] 1
# There are 2 NA values in the deaths column and 1 NA value in the region column.

# Keep only the rows in which both region and deaths are recorded; a row that
# is missing either value cannot contribute to a per-continent death total.
earthquakes <- filter(earthquakes, !(is.na(region) | is.na(deaths)))

#Renaming Region Values

# List every distinct value in "region" so that each one can be assigned to a continent.
unique(earthquakes$region)
##  [1] "Guatemala"              "Russia"                 "Turkey"                
##  [4] "India"                  "Ecuador"                "Taiwan"                
##  [7] "United States"          "Chile"                  "Asia"                  
## [10] "Italy"                  "Iran"                   "China"                 
## [13] "Japan"                  "Nicaragua"              "Armenia-Azerbaijan"    
## [16] "India-Nepal"            "Pakistan"               "Romania"               
## [19] "Argentina"              "Peru"                   "Turkmenistan"          
## [22] "Tajikistan"             "Algeria"                "Afghanistan"           
## [25] "Morocco"                "Yugoslavia"             "Chine"                 
## [28] "New Guinea"             "Philippines"            "Iran-Turkey"           
## [31] "Mexico"                 "El Salvador"            "Colombia-Ecuador"      
## [34] "Armenia"                "Pakistan-Afghanistan"   "Indonesia"             
## [37] "Colombia"               "Afghanistan-Tajikistan"
# Add a "continent" column by looking up each region (a country, a pair of
# bordering countries, or the literal value "Asia") in the lists below.
# "Chine" appears in the data (apparently a misspelling of "China") and is
# mapped to Asia as well.
# NOTE(review): every "Russia" row is assigned to Europe, although at least one
# such row has area "Uzbekistan" (Central Asia) -- confirm this is intended.
earthquakes <- earthquakes |>
  mutate(continent = case_when(
    region %in% c(
      "Guatemala", "Ecuador", "United States", "Chile", "Mexico", "Colombia",
      "El Salvador", "Colombia-Ecuador", "Peru", "Nicaragua", "Argentina"
    ) ~ "Americas",
    region %in% c("Italy", "Romania", "Russia", "Yugoslavia") ~ "Europe",
    region %in% c(
      "India", "China", "Japan", "Iran", "Turkey", "Pakistan", "Taiwan",
      "Indonesia", "Pakistan-Afghanistan", "Philippines", "Armenia",
      "Afghanistan-Tajikistan", "Afghanistan", "India-Nepal", "Iran-Turkey",
      "Chine", "Tajikistan", "Armenia-Azerbaijan", "Asia", "Turkmenistan"
    ) ~ "Asia",
    region %in% c("Morocco", "Algeria") ~ "Africa",
    region %in% c("New Guinea") ~ "Oceania"
  ))

# case_when() returns NA for any region missing from the lists above. Stop here
# rather than letting an unlabelled NA group flow into the summary and chart.
stopifnot(
  "every region must be assigned a continent" = !anyNA(earthquakes$continent)
)

#New dataset

# Build the per-continent summary: one row per continent holding the total
# number of deaths across all of that continent's earthquakes.
c_earthquakes <- summarise(
  group_by(earthquakes, continent),
  deaths = sum(deaths)
)

#Bar Chart

# Bar chart of total earthquake deaths per continent. Bars are sorted from the
# highest to the lowest toll so the ranking can be read directly, and the fill
# legend is hidden because it only repeats the x-axis labels.
ggplot(
  c_earthquakes,
  aes(x = reorder(continent, -deaths), y = deaths, fill = continent)
) +
  geom_col(show.legend = FALSE) +
  theme_classic() +
  labs(
    title = "Deaths caused by Earthquakes per Continent",
    x = "Continent",
    y = "Deaths"
  )

#Conclusion

In conclusion, the bar chart shows that Asia has the highest cumulative deaths from earthquakes. The Americas have the second highest number of deaths, followed by Europe, Africa, and Oceania (greatest to least). I can conclude that Asia has the deadliest earthquakes because it has more earthquake-related deaths than all other continents. However, I also learned from my analysis that Asia has a very large number of countries compared to other continents. If I were to conduct further analysis, I would factor in other variables such as richter to help eliminate bias produced by the difference in population sizes. Another way I could do this is by joining the dataset with one that adds a population (control) variable, so I can calculate the ratio of deaths/population and compare it across continents to get a more accurate comparison.