#Libraries and Load Data
library(readr)
## Warning: package 'readr' was built under R version 4.5.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
# Load the earthquake data from an explicit path rather than calling setwd(),
# so the session's working directory is left untouched. path.expand() resolves
# the leading "~" to the user's home directory.
earthquakes <- read_csv(path.expand("~/Desktop/datasets/earthquakes.csv"))
## Rows: 123 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): month, area, region
## dbl (4): year, day, richter, deaths
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Intro and question
Question: Which continent has the most deadly earthquakes?
The dataset “Earthquakes” is a select set of notable earthquakes from the years 1900-1999. It is a data frame with 123 rows and 7 variables, and comes from the World Almanac and Book of Facts: 2011. The dataset contains variables describing different elements of the earthquakes, each defined below:
“year” - discrete, quantitative variable that contains what year the earthquake took place
“month” - an ordinal, qualitative variable (stored as the month name) that contains what month the earthquake took place
“day” - discrete, quantitative variable that contains what day the earthquake took place
“richter” - continuous, quantitative variable that contains the severity of the earthquake according to the Richter scale
“area” - a nominal, qualitative variable that contains what city the earthquake took place in
“region” - a nominal, qualitative variable that contains what country the earthquake took place in
“deaths” - discrete, quantitative variable that contains how many deaths the earthquake caused
#Overview
To answer the question of which continent has the most
deadly earthquakes, I will focus on the deaths and region variables. I
will perform an Exploratory Data Analysis by checking
structure/head/tail and checking (and handling) NA values. Then I will
map each value of the region variable to a new variable named
“continent”, and create a new data frame named “c_earthquakes” that
holds the total deaths per continent. Finally, I will
construct a bar chart displaying c_earthquakes and summarize my
findings.
bar chart
#Read and Clean Data
# Inspect the structure of the loaded data: dimensions, column names and types.
str(earthquakes)
## spc_tbl_ [123 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ year : num [1:123] 1902 1902 1903 1903 1905 ...
## $ month : chr [1:123] "April" "December" "April" "May" ...
## $ day : num [1:123] 19 16 28 28 4 31 16 18 17 21 ...
## $ richter: num [1:123] 7.5 6.4 7 5.8 7.5 8.8 6.8 7.7 8.6 8.1 ...
## $ area : chr [1:123] "Quezaltenango and San Marco" "Uzbekistan" "Malazgirt" "Gole" ...
## $ region : chr [1:123] "Guatemala" "Russia" "Turkey" "Turkey" ...
## $ deaths : num [1:123] 2000 4700 3500 1000 19000 ...
## - attr(*, "spec")=
## .. cols(
## .. year = col_double(),
## .. month = col_character(),
## .. day = col_double(),
## .. richter = col_double(),
## .. area = col_character(),
## .. region = col_character(),
## .. deaths = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
#The dimensions of the dataset are 123 x 7. year, day, richter, and deaths are numeric. month, area, and region are character.
# Preview the first rows of the data.
head(earthquakes)
## # A tibble: 6 × 7
## year month day richter area region deaths
## <dbl> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 1902 April 19 7.5 Quezaltenango and San Marco Guatemala 2000
## 2 1902 December 16 6.4 Uzbekistan Russia 4700
## 3 1903 April 28 7 Malazgirt Turkey 3500
## 4 1903 May 28 5.8 Gole Turkey 1000
## 5 1905 April 4 7.5 Kangra India 19000
## 6 1906 January 31 8.8 Esmeraldas (off coast) Ecuador 1000
# Preview the last rows of the data.
tail(earthquakes)
## # A tibble: 6 × 7
## year month day richter area region deaths
## <dbl> <chr> <dbl> <dbl> <chr> <chr> <dbl>
## 1 1998 February 4 5.9 Hindu Kush Afghanistan 2323
## 2 1998 May 30 6.6 Border Afghanistan-Tajikistan 4000
## 3 1998 July 17 7 Papua New Guinea 2183
## 4 1999 January 25 6.1 Armenia Colombia 1185
## 5 1999 August 17 7.6 Izmit Turkey 17118
## 6 1999 September 20 7.6 Taichung Taiwan 2400
#The top and bottom of the data look normal. The data is in chronological order by year.
# Count the missing values in the deaths and region columns.
sum(is.na(earthquakes$deaths))
## [1] 2
sum(is.na(earthquakes$region))
## [1] 1
#There are 2 NA values in the deaths column. There is 1 NA value in the region column
# Drop every row where either region or deaths is missing.
earthquakes <- earthquakes |>
  filter(!(is.na(region) | is.na(deaths)))
#NAs removed from the needed columns
Renaming Region Values
# List the distinct values found in the "region" column.
unique(earthquakes$region)
## [1] "Guatemala" "Russia" "Turkey"
## [4] "India" "Ecuador" "Taiwan"
## [7] "United States" "Chile" "Asia"
## [10] "Italy" "Iran" "China"
## [13] "Japan" "Nicaragua" "Armenia-Azerbaijan"
## [16] "India-Nepal" "Pakistan" "Romania"
## [19] "Argentina" "Peru" "Turkmenistan"
## [22] "Tajikistan" "Algeria" "Afghanistan"
## [25] "Morocco" "Yugoslavia" "Chine"
## [28] "New Guinea" "Philippines" "Iran-Turkey"
## [31] "Mexico" "El Salvador" "Colombia-Ecuador"
## [34] "Armenia" "Pakistan-Afghanistan" "Indonesia"
## [37] "Colombia" "Afghanistan-Tajikistan"
# Add a "continent" column by matching each region value against a fixed set
# of names per continent. The sets include the literal values "Chine" and
# "Asia" because both occur in the region column. A region that matches none
# of the sets gets NA, since case_when() has no fallback branch here.
earthquakes <- earthquakes |>
  mutate(
    continent = case_when(
      region %in% c(
        "Argentina", "Chile", "Colombia", "Colombia-Ecuador", "Ecuador",
        "El Salvador", "Guatemala", "Mexico", "Nicaragua", "Peru",
        "United States"
      ) ~ "Americas",
      region %in% c(
        "Italy", "Romania", "Russia", "Yugoslavia"
      ) ~ "Europe",
      region %in% c(
        "Afghanistan", "Afghanistan-Tajikistan", "Armenia",
        "Armenia-Azerbaijan", "Asia", "China", "Chine", "India",
        "India-Nepal", "Indonesia", "Iran", "Iran-Turkey", "Japan",
        "Pakistan", "Pakistan-Afghanistan", "Philippines", "Taiwan",
        "Tajikistan", "Turkey", "Turkmenistan"
      ) ~ "Asia",
      region %in% c("Algeria", "Morocco") ~ "Africa",
      region %in% "New Guinea" ~ "Oceania"
    )
  )
#New dataset
# Collapse the data to one row per continent, holding the total number of
# deaths recorded for that continent.
c_earthquakes <- summarise(
  group_by(earthquakes, continent),
  deaths = sum(deaths)
)
#Bar Chart
# Bar chart of total deaths per continent; each continent gets its own fill.
ggplot(
  data = c_earthquakes,
  mapping = aes(x = continent, y = deaths, fill = continent)
) +
  geom_col() +
  labs(
    title = "Deaths caused by Earthquakes per Continent",
    x = "Continent",
    y = "Deaths"
  ) +
  theme_classic()
#Conclusion
In conclusion, the bar chart shows that Asia has the highest cumulative deaths from earthquakes. The Americas have the second highest number of deaths, and Europe, Africa, and Oceania have the fewest (listed from greatest to least). I can conclude that Asia has the deadliest earthquakes because it has more earthquake-related deaths than all other continents. However, I also learned from my analysis that Asia has a very large number of countries compared to other continents. If I were to conduct further analysis, I would factor in other variables such as richter to help eliminate bias produced from the difference in population sizes. Another way I could do this is by joining the dataset with one that adds a population (control) variable, so I can calculate the ratio of deaths/population and compare it across continents to get a more accurate comparison.