# Report-wide settings: console width and default knitr chunk options.
options(width = 100)
knitr::opts_chunk$set(
  out.width = "1000px",
  dpi = 200,
  message = FALSE,
  warning = FALSE
)
#load packages and csv file
library(ggplot2)
library(dplyr)
library(gridExtra)
library(Amelia)
library(ggmap)
library(gganimate)
library(animation)
The dataset contains 2,000 source events from a tsunami database, with information about the geolocation of each event, the magnitude of the tsunami, and the injuries and damage it caused, from 2000 BC to the present.
# Read the raw tsunami source events from the comma-separated file.
df <- read.csv(file = "sources.csv", sep = ",")
# str(df)  # uncomment to inspect the column types
There are a lot of empty columns/rows or NA values:
#missmap(df, main="Missings Map", col=c("yellow", "black"),y.cex = 0.4, x.cex = 0.6, legend=FALSE)
But since I will only look at the events versus time and geolocation, removing the empty rows will not discard too much data.
INTENSITY_SOLOVIEV
# Keep only the columns needed for the time / location / intensity analysis.
df2 <- df %>%
  select(
    YEAR, CAUSE, MONTH, COUNTRY, LOCATION,
    LATITUDE, LONGITUDE, INTENSITY_SOLOVIEV
  ) %>%
  as.data.frame()
The Intensity_Soloviev is a scale to measure/describe the intensity of the waves generated by a tsunami. Like the Richter scale for earthquakes, it is a logarithmic scale from -6 to 9 (source).
# World map of the events in df2, coloured by Soloviev intensity.
# Rows with a missing value in any df2 column are dropped before plotting.
df2 %>%
  na.omit() %>%
  ggplot() +
  borders("world", colour = "grey75", fill = "white") +
  geom_point(
    aes(x = LONGITUDE, y = LATITUDE, color = INTENSITY_SOLOVIEV),
    size = .5,
    alpha = .5
  ) +
  labs(x = "LONGITUDE", y = "LATITUDE") +
  theme(legend.position = "top")
# Number of events per year, restricted to years after 1800.
g1 <- df %>%
  filter(YEAR > 1800) %>%
  group_by(YEAR) %>%
  summarise(number = n()) %>%
  as.data.frame() %>%
  ggplot() +
  geom_bar(aes(x = YEAR, y = number), stat = "identity") +
  labs(title = "Tsunamis counted in recent history (year~1800)")

# Number of events per year over the whole dataset.
g2 <- df %>%
  group_by(YEAR) %>%
  summarise(number = n()) %>%
  as.data.frame() %>%
  ggplot() +
  geom_bar(aes(x = YEAR, y = number), stat = "identity") +
  labs(title = "All Tsunamis counted since -2000 BC")

# Stack the full-history plot above the recent-history plot.
grid.arrange(g2, g1, ncol = 1)
I can group the rows per `COUNTRY` and plot the number of tsunamis per country (summed over years); however, the geolocations differ within the same country (I guess the geolocation is based on the `LOCATION` of the tsunami, not the `COUNTRY`).
# List the coordinates recorded for every event attributed to Syria.
df %>%
  select(COUNTRY, LATITUDE, LONGITUDE) %>%
  filter(COUNTRY == "SYRIA")
## COUNTRY LATITUDE LONGITUDE
## 1 SYRIA 35.683 35.8
## 2 SYRIA 35.683 35.8
## 3 SYRIA 32.000 35.5
## 4 SYRIA NA NA
## 5 SYRIA 33.000 35.0
## 6 SYRIA 33.500 36.0
## 7 SYRIA 35.000 37.0
## 8 SYRIA NA NA
Therefore, to get only one geolocation per `COUNTRY`, I can take the average of each coordinate, ignoring NAs (I don’t want to use `geocode()`).
# Event count and NA-ignoring mean coordinates for Syria.
df %>%
  select(COUNTRY, LATITUDE, LONGITUDE) %>%
  filter(COUNTRY == "SYRIA") %>%
  summarise(
    number = n(),
    meanLat = mean(LATITUDE, na.rm = TRUE),
    meanLong = mean(LONGITUDE, na.rm = TRUE)
  )
## number meanLat meanLong
## 1 8 34.14433 35.85
# One point per country at its mean coordinates, sized by the event count.
df %>%
  select(COUNTRY, LATITUDE, LONGITUDE) %>%
  group_by(COUNTRY) %>%
  summarise(
    number = n(),
    meanLat = mean(LATITUDE, na.rm = TRUE),
    meanLong = mean(LONGITUDE, na.rm = TRUE)
  ) %>%
  ggplot() +
  borders("world", colour = "grey75", fill = "white") +
  geom_point(aes(x = meanLong, y = meanLat, size = number), alpha = .5) +
  labs(x = "LONGITUDE", y = "LATITUDE") +
  theme(legend.position = "top")
As a comment, we see for example that the `USA` point is located in California: the geolocations are means taken per country, and most of the tsunamis in the USA occurred on the West Coast (earthquakes –> tsunamis).
# Per-country event totals and mean coordinates, sorted from most to fewest.
res <- df %>%
  select(COUNTRY, LATITUDE, LONGITUDE) %>%
  group_by(COUNTRY) %>%
  summarise(
    number = n(),
    meanLat = mean(LATITUDE, na.rm = TRUE),
    meanLong = mean(LONGITUDE, na.rm = TRUE)
  ) %>%
  arrange(desc(number)) %>%
  as.data.frame()

# Order the country factor by decreasing count so the bars come out sorted.
res[["COUNTRY"]] <- with(res, reorder(COUNTRY, -number))
# Share of all events per country, plus a one-decimal label for the bars.
res[["Percentage"]] <- with(res, number / sum(number) * 100)
res[["LABEL"]] <- paste0(round(res[["Percentage"]], 1), "%")

# Bar chart of event counts per country with the percentage above each bar.
ggplot(res, aes(x = COUNTRY, y = number)) +
  geom_bar(stat = "identity", width = 0.9) +
  labs(x = "", y = "") +
  geom_text(
    aes(label = LABEL),
    position = position_dodge(width = 0.9),
    hjust = .1,
    vjust = -0.25,
    size = 2
  ) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    text = element_text(size = 6)
  )
`JAPAN` is the country that has had the most tsunamis.
My goal was to make an animation with `R` to display each tsunami location. I found two packages: `animation` and `gganimate` (which is also a wrapper for `animation`).
As the rows of the dataframe have the `YEAR` column, I can just loop over the rows and plot both `Longitude` and `Latitude`. It works offline (although you will need `ImageMagick` installed on your machine).
#saveGIF({
# for(i in 1:20){
# m<-ggplot() + borders("world",colour="grey75",fill="white") + geom_point(data=df3[i,],aes(x=LONGITUDE, y = LATITUDE))
# print(m)
# }
#},interval =1,movie.name = 'test.gif',ani.width = 600, ani.height = 600)
#m<-ggplot(data=filter(df2,YEAR<0),aes(x=LONGITUDE, y = LATITUDE, color=factor(CAUSE),frame=YEAR)) + #borders("world",colour="grey75",fill="white") + geom_point() + theme(legend.position='top')
#gganimate(m)
History :