# NOTE(review): machine-specific absolute path. The read.csv() calls further
# down use relative file names, so they are resolved against this directory.
setwd("D:/R/Udacity/EDA_Course_Materials/lesson4")
library(ggplot2)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the two indicator tables. The first CSV column is used as the row names
# and is copied into a `country` column below.
affected <- read.csv("indicator_earthquake affected - Data.csv",
                     header = TRUE, row.names = 1)
killed <- read.csv("indicator_earthquake killed - Data.csv",
                   header = TRUE, row.names = 1)

# Tag every row with its casualty type and copy the country name out of the
# row names, so both survive the reshape below.
affected["casualty"] <- "affected"
affected["country"] <- row.names(affected)
killed["casualty"] <- "killed"
killed["country"] <- row.names(killed)

# Reshape wide -> long: every remaining column becomes a (year, total) pair.
tidy_affected <- gather(affected, year, total, -casualty, -country)
tidy_killed <- gather(killed, year, total, -casualty, -country)

# Stack both casualty types into a single long data frame.
data <- rbind(tidy_affected, tidy_killed)

# The year labels come from the CSV column names; keep only the numeric part.
# This is the expression the deprecated tidyr::extract_numeric() evaluated,
# written out so the script no longer depends on a deprecated helper.
data["year"] <- as.numeric(gsub("[^0-9.-]+", "", as.character(data$year)))

# Express totals in thousands and shift every value up by 1, so that a count
# of 0 becomes 1 and stays representable on the log10 axis used further down.
data["total"] <- (data$total / 1000) + 1
We can see that the maximum value of ‘total’ is very far from the mean. When we plot the data, two outliers stand out.
# Five-number summary plus mean of the casualty totals
summary(data[["total"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 1.00 21.13 1.00 47180.00
# Scatter plot of total against year on a linear y axis. The lower y limit of
# 1.002 excludes points at or below that value; ggplot2 drops them and reports
# the number of removed rows in a warning.
ggplot(data, aes(x = year, y = total)) +
  geom_point() +
  scale_y_continuous(
    breaks = seq(0, 47180, 10000),
    limits = c(1.002, 47180)
  ) +
  scale_x_continuous(breaks = seq(1970, 2008, 2))
## Warning: Removed 6722 rows containing missing values (geom_point).
Let’s transform the data to spread it out. Now we can see the distribution of earthquakes in the world better.
# Same scatter plot as before, but drawn on a log10-transformed y coordinate so
# the bulk of the points is no longer squashed against the bottom by the few
# very large values. The y limits still exclude points at or below 1.002.
ggplot(data, aes(x = year, y = total)) +
  geom_point() +
  scale_y_continuous(
    breaks = seq(0, 47180, 10000),
    limits = c(1.002, 47180)
  ) +
  scale_x_continuous(breaks = seq(1970, 2008, 2)) +
  coord_trans(y = "log10")
## Warning: Removed 6722 rows containing missing values (geom_point).
Let’s check the correlation between earthquakes and years.
# Pearson correlation test between casualty totals and year
cor.test(x = data$total, y = data$year, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: data$total and data$year
## t = 1.9179, df = 7564, p-value = 0.05516
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.0004868974 0.0445580908
## sample estimates:
## cor
## 0.02204679
There is no correlation between earthquakes and years, as was expected.
When the data is grouped by casualty, we can see that the number of people affected by earthquakes is much higher than the number of people killed.
# Log-scaled scatter plot of total against year, with points coloured by
# casualty type ("affected" vs "killed"). The y limits exclude points at or
# below 1.002; ggplot2 drops them with a warning.
ggplot(data, aes(x = year, y = total)) +
  geom_point(aes(colour = casualty, group = casualty)) +
  scale_y_continuous(
    breaks = seq(0, 47180, 10000),
    limits = c(1.002, 47180)
  ) +
  scale_x_continuous(breaks = seq(1970, 2008, 5)) +
  coord_trans(y = "log10") +
  labs(title = "Earthquakes by casualty")
## Warning: Removed 6722 rows containing missing values (geom_point).
# Ten largest "affected" records, sorted by descending total. The rows are
# explicitly restricted to casualty == "affected" so the table matches its name
# instead of relying on affected counts always exceeding killed counts.
# Base indexing/subset() is used so the original row names are kept.
top_af <- head(
  subset(data[order(-data$total), ], casualty == "affected"),
  10
)
top_af
## casualty country year total
## 3705 affected China 2008 47175.203
## 1788 affected India 1988 20005.162
## 3049 affected India 2001 6342.817
## 2541 affected China 1996 5312.713
## 3457 affected Pakistan 2005 5202.338
## 620 affected Guatemala 1976 5017.000
## 65 affected Peru 1970 3299.315
## 3535 affected Indonesia 2006 3187.229
## 3220 affected China 2003 3148.259
## 2832 affected China 1999 3037.807
# Ten largest "killed" records, sorted by descending total
top_killed <- data %>%
  filter(casualty == "killed") %>%
  arrange(desc(total)) %>%
  head(10)
top_killed
## casualty country year total
## 1 killed China 1976 243.000
## 2 killed China 2008 88.556
## 3 killed Pakistan 2005 74.338
## 4 killed Peru 1970 67.823
## 5 killed Iran 1990 41.021
## 6 killed Iran 2003 27.797
## 7 killed Iran 1978 26.045
## 8 killed USSR 1988 26.000
## 9 killed Guatemala 1976 24.000
## 10 killed India 2001 21.005
Let’s create a vector of unique values representing the most seismically active countries according to top_af and top_killed.
# Pool the country names from both top-10 tables and drop the duplicates
danger <- unique(c(top_af[["country"]], top_killed[["country"]]))
danger
## [1] "China" "India" "Pakistan" "Guatemala" "Peru" "Indonesia"
## [7] "Iran" "USSR"
# Number of distinct countries collected
length(danger)
## [1] 8
# Keep every row whose country is listed in `danger`. `%in%` tests set
# membership; `country == danger` instead recycled the 8 names down the column
# and compared them position by position, so most matching rows were silently
# dropped (and R warned "longer object length is not a multiple of shorter
# object length").
danger_set <- filter(data, country %in% danger)
Let’s plot the data for the most dangerous seismically active countries.
# Log-scaled scatter plot for the selected countries: colour encodes the
# country and point area scales with sqrt(total). The y limits exclude points
# at or below 1.002; ggplot2 drops them with a warning.
ggplot(danger_set, aes(x = year, y = total)) +
  geom_point(aes(colour = country, group = country, size = sqrt(total))) +
  scale_size_area() +
  coord_trans(y = "log10") +
  scale_y_continuous(limits = c(1.002, 47180)) +
  scale_x_continuous(breaks = seq(1970, 2008, 5))
## Warning: Removed 38 rows containing missing values (geom_point).
We cannot see the earthquake sizes for most countries very well, so let’s zoom in on the plot and look at affected and killed separately.
# Top-10 affected records: colour by country, point area by sqrt(total),
# y coordinate on a log10 scale
ggplot(data = top_af, mapping = aes(x = year, y = total)) +
  geom_point(
    mapping = aes(colour = country, group = country, size = sqrt(total))
  ) +
  scale_size_area() +
  coord_trans(y = "log10") +
  scale_x_continuous(breaks = seq(from = 1970, to = 2008, by = 5)) +
  labs(title = "Earthquakes' size and country, by affected people in thousands")
# Top-10 killed records: colour by country, point area by sqrt(total),
# y coordinate on a log10 scale
ggplot(data = top_killed, mapping = aes(x = year, y = total)) +
  geom_point(
    mapping = aes(colour = country, group = country, size = sqrt(total))
  ) +
  scale_size_area() +
  coord_trans(y = "log10") +
  scale_x_continuous(breaks = seq(from = 1970, to = 2008, by = 5)) +
  labs(title = "Earthquakes' size and country, by killed people in thousands")
# Keep every row whose country is listed in `danger`. `%in%` tests set
# membership; `country == danger` instead recycled the shorter vector down the
# column and compared position by position, so most matching rows were silently
# dropped (and R warned "longer object length is not a multiple of shorter
# object length").
country_set <- filter(data, country %in% danger)
# One panel per country (4 columns), each with its own y range, showing the
# yearly totals in red with geom_smooth()'s default smoother on top.
ggplot(country_set, aes(x = year, y = total)) +
  geom_point(color = "red") +
  geom_smooth() +
  facet_wrap(~country, ncol = 4, scales = "free_y")
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.