Monthly U.S. citizen departures are collected and reported in Tourism Industries U.S. International Air Travel Statistics (I-92 data) Program. Each month NTTO processes and reports outbound figures in the “U.S. International Air Passenger Statistics Report”.
Detailed description of data collection methods for Global Terrorism Database can be found here: https://www.start.umd.edu/gtd/using-gtd/
links <- getHTMLLinks(readLines("https://travel.trade.gov/research/monthly/departures/"))
links <- links[-1] # removing the first link as it doesn't contain the data I'm looking for
for (i in 1:length(links)) {
links[i] <- paste("https://travel.trade.gov", links[i], collapse = "")
}
links <- str_replace_all(links, pattern = " ", replacement = "")
# now i have all the links where i can scrape travel data
# from
n = 2016
travel <- data.frame(month = character(), outbound = numeric())
# tables_dim <- c(NA) # used this line as intermediary step
# to understand html tables structure
for (i in 1:length(links)) {
url <- readLines(links[i])
table <- readHTMLTable(url, header = TRUE)
# tables_dim <- c(tables_dim,length(table)) # used this line
# as intermediary step to understand html tables structure
if (length(table) == 2 | 4 & n > 2001) {
j = 2
} else if (length(table) == 3 & n < 2005) {
j = 2
} else {
j = 1
}
# converting table from HTML file into a data frame
table <- na.omit(as.data.frame(table[j]))
# selecting only rows that have region of interest and 12
# columns for months)
table <- table[(table[, 1] == "Europe"), 2:13]
table <- table[1, ]
colnames(table) <- c(paste(month.abb[c(1:12)], "1", n))
table <- table %>% gather(month, outbound, 1:12)
travel <- rbind(travel, table)
n <- n - 1
}
travel <- travel %>% mutate(outbound = str_replace_all(outbound,
pattern = ",", replacement = ""))
travel$outbound <- as.numeric(travel$outbound)
travel$month <- as.Date(travel$month, format = "%b %d %Y")
# I have noticed that data in year before 2000 showed numbers
# in thousands, so i am correcting it in this step
travel <- travel %>% mutate(outbound = ifelse(month < "2000-01-01",
outbound * 1000, outbound))
traveltest <- travel %>% mutate(year = year(month)) %>% mutate(month = month(month))
files_list <- c("https://raw.githubusercontent.com/agCS/DATA606/master/GTD-Export-11to50.csv",
"https://raw.githubusercontent.com/agCS/DATA606/master/GTD-Export-50to100.csv",
"https://raw.githubusercontent.com/agCS/DATA606/master/GTD-Export-100plus.csv")
terror <- data.frame(matrix(nrow = 0, ncol = 7))
for (i in 1:length(files_list)) {
tmp <- read.csv(files_list[i], stringsAsFactors = FALSE)
tmp <- as.data.frame(tmp, header = TRUE)
tmp <- tmp %>% select(DATE, COUNTRY, CITY, FATALITIES, INJURED,
REGION, ATTACK.TYPE.1)
terror <- rbind(terror, tmp)
}
terror$DATE <- as.Date(terror$DATE)
terror$FATALITIES <- as.numeric(terror$FATALITIES)
terror$INJURED <- as.numeric(terror$INJURED)
# adding new columns: victims, year, month for quick
# summaries
terror <- terror %>% mutate(victims = FATALITIES + INJURED) %>%
mutate(year = year(DATE)) %>% mutate(month = month(DATE))
terrortest <- terror %>% na.omit() %>% group_by(year, month) %>%
summarize(fatal = sum(FATALITIES), injured = sum(INJURED),
attacks = n()) %>% arrange(desc(year), desc(month))
comb <- merge(traveltest, terrortest, by = c("year", "month")) %>%
arrange(desc(year), desc(month))
# If need to keep NAs comb.na <-
# merge(traveltest,terrortest,by=c('year','month'), all =
# TRUE) %>% arrange(desc(year),desc(month))
# comb.na[is.na(comb.na)] <- 0
describe(travel$outbound)
## vars n mean sd median trimmed mad min max
## X1 1 252 973841.9 290583 930029.5 958781.7 325103.8 414958 1837000
## range skew kurtosis se
## X1 1422042 0.43 -0.53 18305.01
ggplot(na.omit(travel)) + geom_line(mapping = aes(x = month,
y = outbound)) + ylim(4e+05, 2e+06) + labs(title = "U.S. citizens outbound travel of to Europe",
x = "", y = "")
travel.avg <- travel %>% group_by(year = year(month)) %>% summarize(avg = mean(outbound))
ggplot(travel.avg) + geom_line(mapping = aes(x = year, y = avg),
color = "red") + ylim(4e+05, 2e+06) + labs(title = "U.S. citizens yearly average travel to Europe",
x = "", y = "")
ggplot() + geom_point(data = traveltest, mapping = aes(x = year,
y = outbound, color = month)) + geom_line(data = travel.avg,
mapping = aes(x = year, y = avg), color = "red") + ylim(4e+05,
2e+06) + labs(title = "Travel yearly average", color = "Month") +
scale_color_gradient(low = "white", high = "black") + labs(title = "Average travel per year",
subtitle = "Travel below average in colder months", x = "",
y = "")
summary(terror)
## DATE COUNTRY CITY
## Min. :1996-01-09 Length:262 Length:262
## 1st Qu.:2001-05-11 Class :character Class :character
## Median :2005-07-04 Mode :character Mode :character
## Mean :2006-08-01
## 3rd Qu.:2013-05-23
## Max. :2016-12-19
##
## FATALITIES INJURED REGION ATTACK.TYPE.1
## Min. : 0.00 Min. : 0.00 Length:262 Length:262
## 1st Qu.: 1.00 1st Qu.: 11.75 Class :character Class :character
## Median : 4.00 Median : 18.00 Mode :character Mode :character
## Mean : 12.41 Mean : 44.12
## 3rd Qu.: 11.00 3rd Qu.: 39.25
## Max. :344.00 Max. :727.00
## NA's :6 NA's :6
## victims year month
## Min. : 11.00 Min. :1996 Min. : 1.000
## 1st Qu.: 15.00 1st Qu.:2001 1st Qu.: 5.000
## Median : 23.50 Median :2005 Median : 8.000
## Mean : 56.52 Mean :2006 Mean : 7.095
## 3rd Qu.: 46.50 3rd Qu.:2013 3rd Qu.:10.000
## Max. :1071.00 Max. :2016 Max. :12.000
## NA's :6
describe(terror$FATALITIES)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 256 12.41 32.38 4 6.2 5.93 0 344 344 7.37 65.15
## se
## X1 2.02
# time series by type and region
ggplot(data = na.omit(terror), mapping = aes(x = DATE, y = FATALITIES,
color = ATTACK.TYPE.1, shape = REGION)) + geom_point() +
labs(title = "Europe terrorist attacks fatalities by region and type",
x = "", y = "", color = "Attack type", shape = "Region")
# map
terror.map <- terror %>% group_by(COUNTRY) %>% summarize(attacks = n())
map <- joinCountryData2Map(terror.map, joinCode = "NAME", nameJoinColumn = "COUNTRY")
## 18 codes from your data successfully matched countries in the map
## 1 codes from your data failed to match with a country code in the map
## 225 codes from the map weren't represented in your data
mapCountryData(map, nameColumnToPlot = "attacks", mapRegion = "europe",
catMethod = c(0:150), colourPalette = "white2Black")
# boxplot eastern vs western europe
terror %>% na.omit() %>% ggplot(aes(x = REGION, y = victims)) +
geom_boxplot() + labs(title = "Total victims distribution by region",
x = "", y = "")
ggplot(na.omit(terror)) + geom_bar(mapping = aes(x = COUNTRY,
fill = ATTACK.TYPE.1)) + coord_flip() + labs(title = "Total attacks by country",
x = "", y = "", fill = "Attack type")
# Russia plots - optional
na.omit(terror) %>% filter(COUNTRY != "Russia") %>% group_by(COUNTRY,
ATTACK.TYPE.1) %>% summarize(count = n()) %>% # arrange(desc(count)) %>%
ggplot() + geom_bar(stat = "identity", mapping = aes(x = reorder(COUNTRY,
count), y = count, fill = ATTACK.TYPE.1), position = "stack") +
coord_flip() + labs(title = "Total attacks by country exluding biggest outlier (Russia)",
x = "", y = "", fill = "Attack type")
terror %>% na.omit() %>% filter(COUNTRY == "Russia") %>% ggplot(mapping = aes(x = year,
fill = ATTACK.TYPE.1)) + geom_bar() + labs(title = "Russia terrorist attacks per year by type",
x = "", y = "", fill = "Attack type")
terror %>% na.omit() %>% filter(COUNTRY == "Russia") %>% ggplot(mapping = aes(x = year,
y = FATALITIES, color = ATTACK.TYPE.1)) + geom_point() +
labs(title = "Russia terrorist attacks fatalities per year by type",
x = "", y = "", color = "Attack type")
Please see more plots under Regression section.
ggplot(data = comb) + geom_bar(mapping = aes(x = attacks)) +
labs(title = "Monthly attacks", subtitle = "Attacks per month are heavily skewed",
x = "Number of attacks", y = "")
All of the below models suggest there is no relationship between attacks/number of victims and outbound travel to Europe.
# all attacks and outbound
ggplot(data = comb) + geom_point(mapping = aes(x = attacks, y = outbound,
color = year)) + labs(title = "Attacks per month vs monthly travel",
x = "", y = "", color = "Year")
m1 <- lm(outbound ~ attacks, data = comb)
summary(m1)
##
## Call:
## lm(formula = outbound ~ attacks, data = comb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -580974 -192980 -3015 205152 653128
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 990810 40009 24.77 <2e-16 ***
## attacks 5122 16026 0.32 0.75
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 275700 on 127 degrees of freedom
## Multiple R-squared: 0.0008036, Adjusted R-squared: -0.007064
## F-statistic: 0.1021 on 1 and 127 DF, p-value: 0.7498
# yearly attacks and average outbound per year
comb2 <- comb %>% group_by(year) %>% summarize(attacks = sum(attacks),
outbound = mean(outbound))
ggplot(data = comb2) + geom_point(mapping = aes(x = attacks,
y = outbound, color = year)) + labs(title = "Attacks per year vs annual average travel",
x = "", y = "", color = "Year")
m2 <- lm(outbound ~ attacks, data = comb2)
summary(m2)
##
## Call:
## lm(formula = outbound ~ attacks, data = comb2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -268491 -57564 35100 61160 113655
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 979789 45062 21.743 6.93e-15 ***
## attacks 1812 3258 0.556 0.585
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 97580 on 19 degrees of freedom
## Multiple R-squared: 0.01602, Adjusted R-squared: -0.03577
## F-statistic: 0.3093 on 1 and 19 DF, p-value: 0.5846
# avg victims and average outbound per year
comb3 <- comb %>% group_by(year) %>% summarize(victims = mean(fatal +
injured), outbound = mean(outbound))
ggplot(data = comb3) + geom_point(mapping = aes(x = victims,
y = outbound, color = year)) + labs(title = "Average vistims per year vs annual average travel",
x = "", y = "", color = "Year")
m3 <- lm(outbound ~ victims, data = comb3)
summary(m3)
##
## Call:
## lm(formula = outbound ~ victims, data = comb3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -266531 -55029 38321 50041 120632
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 991896.34 31619.46 31.370 <2e-16 ***
## victims 97.73 228.32 0.428 0.673
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 97900 on 19 degrees of freedom
## Multiple R-squared: 0.009551, Adjusted R-squared: -0.04258
## F-statistic: 0.1832 on 1 and 19 DF, p-value: 0.6734