Data acquisition and transformation

Monthly U.S. citizen departures are collected and reported in Tourism Industries U.S. International Air Travel Statistics (I-92 data) Program. Each month NTTO processes and reports outbound figures in the “U.S. International Air Passenger Statistics Report”.
Detailed description of data collection methods for Global Terrorism Database can be found here: https://www.start.umd.edu/gtd/using-gtd/

Travel

links <- getHTMLLinks(readLines("https://travel.trade.gov/research/monthly/departures/"))
links <- links[-1]  # removing the first link as it doesn't contain the data I'm looking for
for (i in 1:length(links)) {
    links[i] <- paste("https://travel.trade.gov", links[i], collapse = "")
}
links <- str_replace_all(links, pattern = " ", replacement = "")
# now i have all the links where i can scrape travel data
# from
n = 2016
travel <- data.frame(month = character(), outbound = numeric())
# tables_dim <- c(NA) # used this line as intermediary step
# to understand html tables structure
for (i in 1:length(links)) {
    url <- readLines(links[i])
    table <- readHTMLTable(url, header = TRUE)
    # tables_dim <- c(tables_dim,length(table)) # used this line
    # as intermediary step to understand html tables structure
    if (length(table) == 2 | 4 & n > 2001) {
        j = 2
    } else if (length(table) == 3 & n < 2005) {
        j = 2
    } else {
        j = 1
    }
    # converting table from HTML file into a data frame
    table <- na.omit(as.data.frame(table[j]))
    # selecting only rows that have region of interest and 12
    # columns for months)
    table <- table[(table[, 1] == "Europe"), 2:13]
    table <- table[1, ]
    colnames(table) <- c(paste(month.abb[c(1:12)], "1", n))
    table <- table %>% gather(month, outbound, 1:12)
    travel <- rbind(travel, table)
    n <- n - 1
}
travel <- travel %>% mutate(outbound = str_replace_all(outbound, 
    pattern = ",", replacement = ""))
travel$outbound <- as.numeric(travel$outbound)
travel$month <- as.Date(travel$month, format = "%b %d %Y")
# I have noticed that data in year before 2000 showed numbers
# in thousands, so i am correcting it in this step
travel <- travel %>% mutate(outbound = ifelse(month < "2000-01-01", 
    outbound * 1000, outbound))
traveltest <- travel %>% mutate(year = year(month)) %>% mutate(month = month(month))

Terrorism

files_list <- c("https://raw.githubusercontent.com/agCS/DATA606/master/GTD-Export-11to50.csv", 
    "https://raw.githubusercontent.com/agCS/DATA606/master/GTD-Export-50to100.csv", 
    "https://raw.githubusercontent.com/agCS/DATA606/master/GTD-Export-100plus.csv")
terror <- data.frame(matrix(nrow = 0, ncol = 7))
for (i in 1:length(files_list)) {
    tmp <- read.csv(files_list[i], stringsAsFactors = FALSE)
    tmp <- as.data.frame(tmp, header = TRUE)
    tmp <- tmp %>% select(DATE, COUNTRY, CITY, FATALITIES, INJURED, 
        REGION, ATTACK.TYPE.1)
    terror <- rbind(terror, tmp)
}
terror$DATE <- as.Date(terror$DATE)
terror$FATALITIES <- as.numeric(terror$FATALITIES)
terror$INJURED <- as.numeric(terror$INJURED)

# adding new columns: victims, year, month for quick
# summaries
terror <- terror %>% mutate(victims = FATALITIES + INJURED) %>% 
    mutate(year = year(DATE)) %>% mutate(month = month(DATE))
terrortest <- terror %>% na.omit() %>% group_by(year, month) %>% 
    summarize(fatal = sum(FATALITIES), injured = sum(INJURED), 
        attacks = n()) %>% arrange(desc(year), desc(month))

Combined

comb <- merge(traveltest, terrortest, by = c("year", "month")) %>% 
    arrange(desc(year), desc(month))
# If need to keep NAs comb.na <-
# merge(traveltest,terrortest,by=c('year','month'), all =
# TRUE) %>% arrange(desc(year),desc(month))
# comb.na[is.na(comb.na)] <- 0

Exploratory data analysis

Travel

describe(travel$outbound)
##    vars   n     mean     sd   median  trimmed      mad    min     max
## X1    1 252 973841.9 290583 930029.5 958781.7 325103.8 414958 1837000
##      range skew kurtosis       se
## X1 1422042 0.43    -0.53 18305.01
ggplot(na.omit(travel)) + geom_line(mapping = aes(x = month, 
    y = outbound)) + ylim(4e+05, 2e+06) + labs(title = "U.S. citizens outbound travel of to Europe", 
    x = "", y = "")

travel.avg <- travel %>% group_by(year = year(month)) %>% summarize(avg = mean(outbound))
ggplot(travel.avg) + geom_line(mapping = aes(x = year, y = avg), 
    color = "red") + ylim(4e+05, 2e+06) + labs(title = "U.S. citizens yearly average travel to Europe", 
    x = "", y = "")

ggplot() + geom_point(data = traveltest, mapping = aes(x = year, 
    y = outbound, color = month)) + geom_line(data = travel.avg, 
    mapping = aes(x = year, y = avg), color = "red") + ylim(4e+05, 
    2e+06) + labs(title = "Travel yearly average", color = "Month") + 
    scale_color_gradient(low = "white", high = "black") + labs(title = "Average travel per year", 
    subtitle = "Travel below average in colder months", x = "", 
    y = "")

Terrorism

summary(terror)
##       DATE              COUNTRY              CITY          
##  Min.   :1996-01-09   Length:262         Length:262        
##  1st Qu.:2001-05-11   Class :character   Class :character  
##  Median :2005-07-04   Mode  :character   Mode  :character  
##  Mean   :2006-08-01                                        
##  3rd Qu.:2013-05-23                                        
##  Max.   :2016-12-19                                        
##                                                            
##    FATALITIES        INJURED          REGION          ATTACK.TYPE.1     
##  Min.   :  0.00   Min.   :  0.00   Length:262         Length:262        
##  1st Qu.:  1.00   1st Qu.: 11.75   Class :character   Class :character  
##  Median :  4.00   Median : 18.00   Mode  :character   Mode  :character  
##  Mean   : 12.41   Mean   : 44.12                                        
##  3rd Qu.: 11.00   3rd Qu.: 39.25                                        
##  Max.   :344.00   Max.   :727.00                                        
##  NA's   :6        NA's   :6                                             
##     victims             year          month       
##  Min.   :  11.00   Min.   :1996   Min.   : 1.000  
##  1st Qu.:  15.00   1st Qu.:2001   1st Qu.: 5.000  
##  Median :  23.50   Median :2005   Median : 8.000  
##  Mean   :  56.52   Mean   :2006   Mean   : 7.095  
##  3rd Qu.:  46.50   3rd Qu.:2013   3rd Qu.:10.000  
##  Max.   :1071.00   Max.   :2016   Max.   :12.000  
##  NA's   :6
describe(terror$FATALITIES)
##    vars   n  mean    sd median trimmed  mad min max range skew kurtosis
## X1    1 256 12.41 32.38      4     6.2 5.93   0 344   344 7.37    65.15
##      se
## X1 2.02
# time series by type and region
ggplot(data = na.omit(terror), mapping = aes(x = DATE, y = FATALITIES, 
    color = ATTACK.TYPE.1, shape = REGION)) + geom_point() + 
    labs(title = "Europe terrorist attacks fatalities by region and type", 
        x = "", y = "", color = "Attack type", shape = "Region")

# map
terror.map <- terror %>% group_by(COUNTRY) %>% summarize(attacks = n())
map <- joinCountryData2Map(terror.map, joinCode = "NAME", nameJoinColumn = "COUNTRY")
## 18 codes from your data successfully matched countries in the map
## 1 codes from your data failed to match with a country code in the map
## 225 codes from the map weren't represented in your data
mapCountryData(map, nameColumnToPlot = "attacks", mapRegion = "europe", 
    catMethod = c(0:150), colourPalette = "white2Black")

# boxplot eastern vs western europe
terror %>% na.omit() %>% ggplot(aes(x = REGION, y = victims)) + 
    geom_boxplot() + labs(title = "Total victims distribution by region", 
    x = "", y = "")

ggplot(na.omit(terror)) + geom_bar(mapping = aes(x = COUNTRY, 
    fill = ATTACK.TYPE.1)) + coord_flip() + labs(title = "Total attacks by country", 
    x = "", y = "", fill = "Attack type")

# Russia plots - optional
na.omit(terror) %>% filter(COUNTRY != "Russia") %>% group_by(COUNTRY, 
    ATTACK.TYPE.1) %>% summarize(count = n()) %>% # arrange(desc(count)) %>%
ggplot() + geom_bar(stat = "identity", mapping = aes(x = reorder(COUNTRY, 
    count), y = count, fill = ATTACK.TYPE.1), position = "stack") + 
    coord_flip() + labs(title = "Total attacks by country exluding biggest outlier (Russia)", 
    x = "", y = "", fill = "Attack type")

terror %>% na.omit() %>% filter(COUNTRY == "Russia") %>% ggplot(mapping = aes(x = year, 
    fill = ATTACK.TYPE.1)) + geom_bar() + labs(title = "Russia terrorist attacks per year by type", 
    x = "", y = "", fill = "Attack type")

terror %>% na.omit() %>% filter(COUNTRY == "Russia") %>% ggplot(mapping = aes(x = year, 
    y = FATALITIES, color = ATTACK.TYPE.1)) + geom_point() + 
    labs(title = "Russia terrorist attacks fatalities per year by type", 
        x = "", y = "", color = "Attack type")

Combined

Please see more plots under Regression section.

ggplot(data = comb) + geom_bar(mapping = aes(x = attacks)) + 
    labs(title = "Monthly attacks", subtitle = "Attacks per month are heavily skewed", 
        x = "Number of attacks", y = "")

Regression models - optional

All of the below models suggest there is no relationship between attacks/number of victims and outbound travel to Europe.

# all attacks and outbound
ggplot(data = comb) + geom_point(mapping = aes(x = attacks, y = outbound, 
    color = year)) + labs(title = "Attacks per month vs monthly travel", 
    x = "", y = "", color = "Year")

m1 <- lm(outbound ~ attacks, data = comb)
summary(m1)
## 
## Call:
## lm(formula = outbound ~ attacks, data = comb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -580974 -192980   -3015  205152  653128 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   990810      40009   24.77   <2e-16 ***
## attacks         5122      16026    0.32     0.75    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 275700 on 127 degrees of freedom
## Multiple R-squared:  0.0008036,  Adjusted R-squared:  -0.007064 
## F-statistic: 0.1021 on 1 and 127 DF,  p-value: 0.7498
# yearly attacks and average outbound per year
comb2 <- comb %>% group_by(year) %>% summarize(attacks = sum(attacks), 
    outbound = mean(outbound))
ggplot(data = comb2) + geom_point(mapping = aes(x = attacks, 
    y = outbound, color = year)) + labs(title = "Attacks per year vs annual average travel", 
    x = "", y = "", color = "Year")

m2 <- lm(outbound ~ attacks, data = comb2)
summary(m2)
## 
## Call:
## lm(formula = outbound ~ attacks, data = comb2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -268491  -57564   35100   61160  113655 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   979789      45062  21.743 6.93e-15 ***
## attacks         1812       3258   0.556    0.585    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 97580 on 19 degrees of freedom
## Multiple R-squared:  0.01602,    Adjusted R-squared:  -0.03577 
## F-statistic: 0.3093 on 1 and 19 DF,  p-value: 0.5846
# avg victims and average outbound per year
comb3 <- comb %>% group_by(year) %>% summarize(victims = mean(fatal + 
    injured), outbound = mean(outbound))
ggplot(data = comb3) + geom_point(mapping = aes(x = victims, 
    y = outbound, color = year)) + labs(title = "Average vistims per year vs annual average travel", 
    x = "", y = "", color = "Year")

m3 <- lm(outbound ~ victims, data = comb3)
summary(m3)
## 
## Call:
## lm(formula = outbound ~ victims, data = comb3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -266531  -55029   38321   50041  120632 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 991896.34   31619.46  31.370   <2e-16 ***
## victims         97.73     228.32   0.428    0.673    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 97900 on 19 degrees of freedom
## Multiple R-squared:  0.009551,   Adjusted R-squared:  -0.04258 
## F-statistic: 0.1832 on 1 and 19 DF,  p-value: 0.6734