options(width=100)
knitr::opts_chunk$set(out.width='1000px',dpi=200,message=FALSE,warning=FALSE)
#load packages and csv file
library(ggplot2)
library(dplyr)
library(gridExtra)
library(grid)
library(RColorBrewer)
library(ggrepel)
library(ggthemes)
library(viridis)
library(ggjoy)

1 Data preparation

df<-read.csv('ufo_sighting_data.csv',sep=',',stringsAsFactors=F)
df$latitude<-as.numeric(df$latitude)
df$year<-sapply(df$Date_time, function(x) as.numeric(strsplit(strsplit(x," ")[[1]][1],"/")[[1]][3]))
df$month<-sapply(df$Date_time, function(x) as.numeric(strsplit(strsplit(x," ")[[1]][1],"/")[[1]][1]))
df$day<-sapply(df$Date_time, function(x) as.numeric(strsplit(strsplit(x," ")[[1]][1],"/")[[1]][2]))
df$hour<-sapply(df$Date_time, function(x) as.numeric(strsplit(strsplit(x," ")[[1]][2],":")[[1]][1]))
df$min<-sapply(df$Date_time, function(x) as.numeric(strsplit(strsplit(x," ")[[1]][2],":")[[1]][2]))
df$month_name<-sapply(df$month, function(x) month.name[x])
df$month_name_ordered<-factor(df$month_name, levels =c(month.name))
df$DATE<-sapply(df$Date_time, function(x) strsplit(x,' ')[[1]][1])
df$DATE_2<-as.Date(df$DATE,"%m/%d/%Y")
df$weekday <- factor(weekdays(df$DATE_2, T), levels = rev(c("Mon", "Tue", "Wed", "Thu","Fri", "Sat", "Sun")))
df$length_min<-as.numeric(df$length_of_encounter_seconds)/60
blank_theme <- theme_fivethirtyeight()+
  theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  panel.border = element_blank(),
  panel.grid=element_blank(),
  axis.ticks = element_blank(),
  plot.title=element_text(size=14, face="bold")
  )

2 Overview

g1<-ggplotGrob(
  df %>% 
    group_by(UFO_shape) %>% 
    summarize(count=n()) %>% 
    ggplot(aes(x="",y=count,fill=UFO_shape)) + 
    geom_bar(width = 1, stat = "identity",color='white',size=.25) + coord_polar("y") + 
    scale_fill_manual(name="",values=viridis::inferno(length(unique(df$UFO_shape)))) + 
    blank_theme + 
    theme(axis.text.x=element_blank(), 
          legend.text=element_text(size=8),
          legend.key.size = unit(.3, "cm")))

g2<-df %>% 
  group_by(year,country) %>% 
  summarize(count=n()) %>%
  ggplot(aes(x=year,y=count,fill=country)) + 
  geom_histogram(stat='identity',width=1,color='white',size=.25) + 
  scale_fill_brewer(palette='Paired') + geom_vline(xintercept=1993,color='black') + 
  annotate("text", x = 1985, y = 6000, label = "X-Files\nTV-show starts",color='black') + 
  theme_fivethirtyeight() + 
  ggtitle('UFO sightings overview\nby year, shape, location') + 
  theme(legend.position='right',legend.direction='vertical')
g2 + annotation_custom(grob = g1, xmin = 1900, xmax = 1950, ymin = 1500, ymax = 6500)

2.1 In details : Location

countries_map <-map_data("world")
world_map<-ggplot() + 
  geom_map(data = countries_map, 
           map = countries_map,aes(x = long, y = lat, map_id = region, group = group),
           fill = "white", color = "black", size = 0.1)

world_map + geom_point(data=df,aes(x=longitude,y=latitude),alpha=.5,size=.25) + theme_fivethirtyeight() + ggtitle('Location of UFO sightings')

  • For sure ET likes the US …
  • so since most of the observations were done here
states_map<-map_data("state")
usMap<-ggplot() + 
  geom_map(data = states_map, map = states_map,aes(x = long, y = lat, map_id = region, group = group),fill = "white", color = "black", size = 0.1) + 
  theme_fivethirtyeight()

usMap + 
  geom_point(data=filter(df,country=='us' & state.province!='hi' & state.province!='ak' & state.province!='pr'),aes(x=longitude,y=latitude),alpha=.75,size=.5) + ggtitle('Location of UFO sightings in the US')

  • yep … ET prefers the east coast and the big cities …

2.2 Time during the day

df %>% group_by(hour,month_name_ordered) %>% summarize(count=n()) %>% ggplot(aes(x=factor(hour),y=count,color=month_name_ordered,group=month_name_ordered)) + geom_line() + theme_fivethirtyeight() + scale_color_manual(name="",values=brewer.pal(12,'Paired')) + geom_point(color='black',size=.5,alpha=1) + ggtitle('# of UFO observations during the day')

  • mostly in evening
  • also note that the peak decreases during winter

2.3 Time during the week

df %>% group_by(weekday,month_name_ordered) %>% summarize(count=n()) %>% ggplot(aes(x=weekday,y=count,color=month_name_ordered,group=month_name_ordered)) + geom_line() + theme_fivethirtyeight() + scale_color_brewer(name="",palette='Paired') + geom_point(color='black',size=.5,alpha=1) + ggtitle('# of UFO observations during the week')

  • mostly during the weekend … for sure ET has a busy week at work and prefers to go outside during the weekend.

3 Duration of the encounter

Let’s take a look at the duration of the encounter.

3.1 Outliers

First a look at the distribution of the duration (in minutes) reveals that there are some very large outliers. For example :

df %>% filter(length_min>1e6) %>% select(-c(length_of_encounter_seconds,DATE_2,description,year, month, day ,hour, min, month_name, month_name_ordered,state.province))
##         Date_time                    city country UFO_shape described_duration_of_encounter
## 1 10/1/1983 17:00 birmingham (uk/england)      gb    sphere                        31 years
## 2  6/3/2010 23:30         ottawa (canada)      ca     other                        23000hrs
## 3 9/15/1991 18:00              greenbrier      us     light                        21 years
##   date_documented latitude  longitude      DATE weekday length_min
## 1       4/12/2013 52.46667  -1.916667 10/1/1983     Sat    1630600
## 2        7/6/2010 45.41667 -75.700000  6/3/2010     Thu    1380000
## 3       3/31/2008 35.23389 -92.387500 9/15/1991     Sun    1104600
  • it seems there is a typo in the largest encounter because 97836000 makes 3.1 years, however the difference between the date_documented and the Date_time is 30 years
  • but even though 30 or 21 years is a very long time
df %>% 
  select(length_min, country) %>% 
  ggplot(aes(x=log(length_min), y= country)) + 
  geom_joy(scale = 1.5) + theme_joy() + 
  ggtitle("log(encounter's duration) [min] vs. country") + xlab('minutes(log scale)') + ylab('Country')

3.2 Average duration by UFO shape

df %>% 
  select(UFO_shape,length_min) %>% 
  group_by(UFO_shape) %>% 
  mutate(medianLength = median(log(length_min))) %>% 
  ggplot(aes(x=reorder(UFO_shape, medianLength),y=log(length_min),fill=UFO_shape)) + 
  geom_boxplot(colour='black',size=.4,alpha=.5) + 
  geom_jitter(shape=16,position=position_jitter(0.2),size=.25) + 
  coord_flip() + theme_fivethirtyeight() + 
  scale_fill_manual(values=viridis::inferno(length(unique(df$UFO_shape)))) + 
  theme(legend.position='right',legend.direction='vertical') + 
  ggtitle('Log(duration) [min] vs. UFO shape')

  • the shortest durations are found for shapes like flash, fireball, ie quick events
  • the longest durations are found for bigger shapes, like diamond, sphere, cone

History :

  • version 1 : initial commit