# Print console output up to 100 characters wide.
options(width = 100)

# Default chunk options for the report: figure display width and resolution,
# with messages and warnings switched off for every chunk.
knitr::opts_chunk$set(
  out.width = "1000px",
  dpi = 200,
  message = FALSE,
  warning = FALSE
)
#load packages and csv file
library(ggplot2)
library(dplyr)
library(gridExtra)
library(Amelia)
library(ggmap)
library(gganimate)
library(animation)

Data & quick Intro

The dataset contains 2,000 source events from a tsunami database, with information about each event's geolocation, the magnitude of the tsunami, and the injuries and damage it caused, from 2000 BC to the present.

# Load the tsunami source events; the path is relative to the working directory.
df <- read.csv("sources.csv", sep = ",")
# str(df)

Many columns and rows contain empty or NA values:

#missmap(df, main="Missings Map", col=c("yellow", "black"),y.cex = 0.4, x.cex = 0.6, legend=FALSE)

However, since I will only look at the events versus time and geolocation, removing the rows with missing values will not discard too much data.

Map vs. INTENSITY_SOLOVIEV
# Keep only the date, cause, place and intensity columns, as a plain data.frame.
df2 <- df %>%
  select(
    YEAR, CAUSE, MONTH, COUNTRY, LOCATION,
    LATITUDE, LONGITUDE, INTENSITY_SOLOVIEV
  ) %>%
  as.data.frame()

INTENSITY_SOLOVIEV is the Soloviev intensity, a scale that describes the intensity of the waves generated by a tsunami. Like the Richter scale for earthquakes, it is a logarithmic scale, ranging from -6 to 9 (source).

# World map of the events, coloured by Soloviev intensity.
# na.omit() removes every row of df2 that has an NA in any of its columns.
df2 %>%
  na.omit() %>%
  ggplot() +
  borders("world", colour = "grey75", fill = "white") +
  geom_point(
    aes(x = LONGITUDE, y = LATITUDE, color = INTENSITY_SOLOVIEV),
    size = .5,
    alpha = .5
  ) +
  labs(x = "LONGITUDE", y = "LATITUDE") +
  theme(legend.position = "top")

Historic
# Number of events per YEAR over the whole record.
g2 <- df %>%
  count(YEAR, name = "number") %>%
  as.data.frame() %>%
  ggplot() +
  geom_bar(aes(x = YEAR, y = number), stat = "identity") +
  ggtitle("All Tsunamis counted since -2000 BC")

# Same yearly count, restricted to events with YEAR > 1800.
g1 <- df %>%
  filter(YEAR > 1800) %>%
  count(YEAR, name = "number") %>%
  as.data.frame() %>%
  ggplot() +
  geom_bar(aes(x = YEAR, y = number), stat = "identity") +
  ggtitle("Tsunamis counted in recent history (year~1800)")

# Stack the two charts: full history on top, recent history below.
grid.arrange(g2, g1, ncol = 1)

Tsunamis per Country

How-to

I can group the rows by COUNTRY and plot the number of tsunamis per country (summed over all years). However, the geolocations differ within the same country (I guess the geolocation is based on the LOCATION of the tsunami, not on the COUNTRY).

# Coordinates recorded for every event whose COUNTRY is SYRIA.
df %>%
  filter(COUNTRY == "SYRIA") %>%
  select(COUNTRY, LATITUDE, LONGITUDE)
##   COUNTRY LATITUDE LONGITUDE
## 1   SYRIA   35.683      35.8
## 2   SYRIA   35.683      35.8
## 3   SYRIA   32.000      35.5
## 4   SYRIA       NA        NA
## 5   SYRIA   33.000      35.0
## 6   SYRIA   33.500      36.0
## 7   SYRIA   35.000      37.0
## 8   SYRIA       NA        NA

Therefore, to get a single geolocation per COUNTRY, I can take the average of each coordinate, with NAs removed (I don’t want to use geocode()).

# One-row summary for SYRIA: number of events and the mean of each coordinate,
# with NA values ignored.
df %>%
  filter(COUNTRY == "SYRIA") %>%
  select(COUNTRY, LATITUDE, LONGITUDE) %>%
  summarise(
    number = n(),
    meanLat = mean(LATITUDE, na.rm = TRUE),
    meanLong = mean(LONGITUDE, na.rm = TRUE)
  )
##   number  meanLat meanLong
## 1      8 34.14433    35.85
Map
# One point per COUNTRY, drawn at the mean coordinates of its events (NA values
# ignored) and sized by the number of events.
df %>%
  select(COUNTRY, LATITUDE, LONGITUDE) %>%
  group_by(COUNTRY) %>%
  summarise(
    number = n(),
    meanLat = mean(LATITUDE, na.rm = TRUE),
    meanLong = mean(LONGITUDE, na.rm = TRUE)
  ) %>%
  ggplot() +
  borders("world", colour = "grey75", fill = "white") +
  geom_point(aes(x = meanLong, y = meanLat, size = number), alpha = .5) +
  labs(x = "LONGITUDE", y = "LATITUDE") +
  theme(legend.position = "top")

Note that, for example, the USA point is located in California: the coordinates are averaged per country, and most of the tsunamis in the USA occurred on the West Coast (earthquakes –> tsunamis).

Distribution
# Per-country event count and mean coordinates, sorted by decreasing count.
res <- df %>%
  select(COUNTRY, LATITUDE, LONGITUDE) %>%
  group_by(COUNTRY) %>%
  summarise(
    number = n(),
    meanLat = mean(LATITUDE, na.rm = TRUE),
    meanLong = mean(LONGITUDE, na.rm = TRUE)
  ) %>%
  arrange(desc(number)) %>%
  as.data.frame()

# Order the COUNTRY levels by decreasing count so the bars follow that order.
res$COUNTRY <- reorder(res$COUNTRY, -res$number)

# Share of all events per country, plus the text label drawn above each bar.
res$Percentage <- res$number / sum(res$number) * 100
res$LABEL <- paste0(round(res$Percentage, 1), "%")

ggplot(res, aes(x = COUNTRY, y = number)) +
  geom_bar(width = 0.9, stat = "identity") +
  labs(x = "", y = "") +
  geom_text(
    aes(label = LABEL),
    position = position_dodge(width = 0.9),
    hjust = .1,
    vjust = -0.25,
    size = 2
  ) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    text = element_text(size = 6)
  )

JAPAN is the country that had the most Tsunamis.

Animation tests

My goal was to make an animation with R to display each Tsunami location. I found 2 packages : animation and gganimate (which is also a wrapper for animation).

Since each row of the data frame has a YEAR value, I can simply loop over the rows and plot the longitude and latitude of each one. This works offline (although you will need ImageMagick installed on your machine).

#saveGIF({
#  for(i in 1:20){
#    m<-ggplot() + borders("world",colour="grey75",fill="white") + geom_point(data=df3[i,],aes(x=LONGITUDE, y = LATITUDE))
#    print(m)
#    }
#},interval =1,movie.name = 'test.gif',ani.width = 600, ani.height = 600)
test with small number of entries
#m<-ggplot(data=filter(df2,YEAR<0),aes(x=LONGITUDE, y = LATITUDE, color=factor(CAUSE),frame=YEAR)) + #borders("world",colour="grey75",fill="white") + geom_point() + theme(legend.position='top')
#gganimate(m)

History :

  • version 1 : initial commit