Earthquakes: consequences of disasters

# NOTE(review): hard-coded absolute working directory. The CSV files read
# further down use relative paths, so they are resolved against this folder
# and the script only runs on a machine where this path exists.
setwd("D:/R/Udacity/EDA_Course_Materials/lesson4")
library(ggplot2)  # plotting
library(tidyr)    # wide-to-long reshaping
library(dplyr)    # filter(), arrange()

## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Affected and killed by earthquakes

# Read the two indicator files. `row.names = 1` turns the first CSV column
# into the row names of each data frame; the code below treats those row
# names as country names.
affected <- read.csv("indicator_earthquake affected - Data.csv", header = TRUE, row.names = 1)
killed <- read.csv("indicator_earthquake killed - Data.csv", header = TRUE, row.names = 1)

# Tag each dataset with its casualty type and copy the row names into a
# regular `country` column so they survive the reshape and the rbind below.
affected["casualty"] <- "affected"
affected["country"] <- row.names(affected)
killed["casualty"] <- "killed"
killed["country"] <- row.names(killed)

# Reshape from wide to long: every column except `casualty` and `country`
# becomes one (year, total) row.
tidy_affected <- gather(affected, year, total, -casualty, -country)
tidy_killed <- gather(killed, year, total, -casualty, -country)

# Stack the two long datasets into one.
data <- rbind(tidy_affected, tidy_killed)

# Convert the year labels to numbers by stripping every character that is not
# a digit, "." or "-". This is the same expression tidyr's deprecated
# extract_numeric() evaluates, written out so no deprecation warning is raised.
# NOTE(review): presumably the labels look like "X1970" after read.csv - confirm.
data["year"] <- as.numeric(gsub("[^0-9.-]+", "", as.character(data$year)))

# Express totals in thousands and shift EVERY value up by 1, so that a count
# of zero becomes 1 and stays defined on the log10 axis used by the plots.
# Non-zero values are shifted too (e.g. 242,000 people is stored as 243).
data["total"] <- (data$total / 1000) + 1

Explore total number of affected and killed people (in thousands)

We can see that the maximum of ‘total’ is very far from the mean. When we plot the data, we can see two outliers.

# Min / quartiles / mean / max of the scaled totals (thousands, shifted by 1).
summary(data$total)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     1.00     1.00     1.00    21.13     1.00 47180.00

# Scatter of total (in thousands) against year on a linear y-axis.
# The lower y limit of 1.002 lies just above 1, the value that zero counts
# were shifted to, so zero and near-zero points fall outside the limits and
# are dropped from the plot.
ggplot(data = data, mapping = aes(x = year, y = total)) +
  geom_point() +
  scale_x_continuous(breaks = seq(from = 1970, to = 2008, by = 2)) +
  scale_y_continuous(
    limits = c(1.002, 47180),
    breaks = seq(from = 0, to = 47180, by = 10000)
  )

## Warning: Removed 6722 rows containing missing values (geom_point).

Transform data

Let’s put the y-axis on a log10 scale to spread the points out. Now the distribution of casualty totals is much easier to see.

# Same scatter as the linear version, but the y coordinate is drawn on a
# log10 scale via coord_trans(); the scale limits and breaks are unchanged.
ggplot(data = data, mapping = aes(x = year, y = total)) +
  geom_point() +
  scale_x_continuous(breaks = seq(from = 1970, to = 2008, by = 2)) +
  scale_y_continuous(
    limits = c(1.002, 47180),
    breaks = seq(from = 0, to = 47180, by = 10000)
  ) +
  coord_trans(y = "log10")

## Warning: Removed 6722 rows containing missing values (geom_point).

Let’s check the correlation between casualty totals and year.

# Correlation test between total and year (cor.test defaults to Pearson).
cor.test(data$total, data$year)

## 
##  Pearson's product-moment correlation
## 
## data:  data$total and data$year
## t = 1.9179, df = 7564, p-value = 0.05516
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.0004868974  0.0445580908
## sample estimates:
##        cor 
## 0.02204679

There is no meaningful correlation between casualty totals and year (r ≈ 0.02, p ≈ 0.055), as was expected.

Earthquakes by casualty

When the data is grouped by casualty type, we can see that the number of people affected by earthquakes is much higher than the number of people killed.

# Log-scale scatter of total against year, with points coloured (and grouped)
# by casualty type: "affected" vs "killed".
ggplot(data = data, mapping = aes(x = year, y = total)) +
  geom_point(mapping = aes(colour = casualty, group = casualty)) +
  scale_x_continuous(breaks = seq(from = 1970, to = 2008, by = 5)) +
  scale_y_continuous(
    limits = c(1.002, 47180),
    breaks = seq(from = 0, to = 47180, by = 10000)
  ) +
  coord_trans(y = "log10") +
  labs(title = "Earthquakes by casualty")

## Warning: Removed 6722 rows containing missing values (geom_point).

Top 10 earthquakes by number of affected people (in thousands)

# Ten largest "affected" totals. Restrict to the "affected" records explicitly
# so the table matches its heading even if a "killed" figure ever exceeded an
# "affected" one (previously all rows were ranked together). Base subsetting
# keeps the original row names, and order() puts NA totals last.
affected_rows <- data[data$casualty == "affected", ]
top_af <- head(affected_rows[order(-affected_rows$total), ], 10)
top_af

##      casualty   country year     total
## 3705 affected     China 2008 47175.203
## 1788 affected     India 1988 20005.162
## 3049 affected     India 2001  6342.817
## 2541 affected     China 1996  5312.713
## 3457 affected  Pakistan 2005  5202.338
## 620  affected Guatemala 1976  5017.000
## 65   affected      Peru 1970  3299.315
## 3535 affected Indonesia 2006  3187.229
## 3220 affected     China 2003  3148.259
## 2832 affected     China 1999  3037.807

Top 10 earthquakes by number of killed people (in thousands)

# Ten largest "killed" totals, sorted from highest to lowest.
top_killed <- data %>%
  filter(casualty == "killed") %>%
  arrange(desc(total)) %>%
  head(10)
top_killed

##    casualty   country year   total
## 1    killed     China 1976 243.000
## 2    killed     China 2008  88.556
## 3    killed  Pakistan 2005  74.338
## 4    killed      Peru 1970  67.823
## 5    killed      Iran 1990  41.021
## 6    killed      Iran 2003  27.797
## 7    killed      Iran 1978  26.045
## 8    killed      USSR 1988  26.000
## 9    killed Guatemala 1976  24.000
## 10   killed     India 2001  21.005

Most seismoactive countries

Let’s create a vector of unique values representing the most seismoactive countries according to top_af and top_killed.

# Countries that appear in either top-10 table, each listed once, in order of
# first appearance (the affected table is scanned first).
danger <- union(top_af$country, top_killed$country)
danger

## [1] "China"     "India"     "Pakistan"  "Guatemala" "Peru"      "Indonesia"
## [7] "Iran"      "USSR"

# Number of distinct countries in `danger`.
length(danger)

## [1] 8

# Keep every row whose country is one of the `danger` countries.
# `%in%` tests set membership. The previous `country == danger` recycled the
# shorter `danger` vector along the column, so each row was compared against
# a single, position-dependent country: most matching rows were silently
# dropped and R warned that the longer object length is not a multiple of the
# shorter object length.
danger_set <- filter(data, country %in% danger)

Let’s plot data of most dangerous seismoactive countries

# Log-scale scatter for the selected countries: colour identifies the country
# and point area scales with sqrt(total).
ggplot(data = danger_set, mapping = aes(x = year, y = total)) +
  geom_point(
    mapping = aes(colour = country, group = country, size = sqrt(total))
  ) +
  scale_size_area() +
  scale_x_continuous(breaks = seq(from = 1970, to = 2008, by = 5)) +
  scale_y_continuous(limits = c(1.002, 47180)) +
  coord_trans(y = "log10")

## Warning: Removed 38 rows containing missing values (geom_point).

We cannot clearly see the earthquake sizes for most countries, so let’s zoom in on the plot and look at affected and killed separately.

# Top-10 "affected" records only: colour identifies the country, point area
# scales with sqrt(total), and the y coordinate is drawn on a log10 scale.
ggplot(data = top_af, mapping = aes(x = year, y = total)) +
  geom_point(
    mapping = aes(colour = country, group = country, size = sqrt(total))
  ) +
  scale_size_area() +
  scale_x_continuous(breaks = seq(from = 1970, to = 2008, by = 5)) +
  coord_trans(y = "log10") +
  labs(title = "Earthquakes' size and country, by affected people in thousands")

# Top-10 "killed" records only: colour identifies the country, point area
# scales with sqrt(total), and the y coordinate is drawn on a log10 scale.
ggplot(data = top_killed, mapping = aes(x = year, y = total)) +
  geom_point(
    mapping = aes(colour = country, group = country, size = sqrt(total))
  ) +
  scale_size_area() +
  scale_x_continuous(breaks = seq(from = 1970, to = 2008, by = 5)) +
  coord_trans(y = "log10") +
  labs(title = "Earthquakes' size and country, by killed people in thousands")

# All rows for the countries listed in `danger`.
# Membership must be tested with `%in%`: the previous `country == danger`
# recycled the shorter vector element by element, which dropped most matching
# rows and raised a "longer object length is not a multiple of shorter object
# length" warning.
country_set <- filter(data, country %in% danger)

# One panel per country (4 columns), each with its own y range: red points
# for the yearly totals plus a default geom_smooth() trend line.
ggplot(data = country_set, mapping = aes(x = year, y = total)) +
  geom_point(colour = "red") +
  geom_smooth() +
  facet_wrap(~ country, ncol = 4, scales = "free_y")

## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.