# install.packages("tidyverse")
library(readxl)
library(tidyr)
library(dplyr)
library(stringr)
library(ggplot2)
My Post: “World Tourism Data”
http://data.un.org/DocumentData.aspx?id=375
The data looks pretty “untidy” and one can transform it and perform several analysis, such as comparing Inbound vs. Outbound tourism numbers, detecting trends through the years, comparing numbers between countries, and etc.
Downloaded XLS file, “Arrivals of non resident tourists_visitors, depart.xls” and saved it as “World Tourism Data.xlsx”
Here’s how the data looks initially: image:
Read XLSX File, skiping the first 5 rows in order to get to proper headers
# Get path [in platform independent way using file.path() function]
path <- file.path("~", "R", "Project 2", "World Tourism Data.xlsx")
df <- read_xlsx(path, skip = 5) %>% tbl_df()
df
## # A tibble: 2,465 x 25
## X__1 COUNTRY X__2 Series `1995` `1996` `1997` `1998` `1999` `2000`
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 4. AFGHANIST~ <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 2 NA Inbound t~ <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 3 NA Arrivals ~ <NA> .. .. .. .. .. .. ..
## 4 NA Tourism e~ <NA> IMF .. .. .. .. .. ..
## 5 NA Travel - ~ <NA> IMF .. .. .. .. .. ..
## 6 NA Passenger~ <NA> IMF .. .. .. .. .. ..
## 7 NA Outbound ~ <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 8 NA Departure~ <NA> .. .. .. .. .. .. ..
## 9 NA Tourism e~ <NA> IMF .. .. .. .. .. ..
## 10 NA Travel - ~ <NA> IMF .. .. .. .. .. ..
## # ... with 2,455 more rows, and 15 more variables: `2001` <chr>,
## # `2002` <chr>, `2003` <chr>, `2004` <chr>, `2005` <chr>, `2006` <chr>,
## # `2007` <chr>, `2008` <chr>, `2009` <chr>, `2010` <chr>, `2011` <chr>,
## # `2012` <chr>, `2013` <chr>, `2014` <chr>, NOTES <chr>
I decided to look at the data for North America: rows [419:429] for CANADA and rows [2340:2350] for USA
dfCAN <- df[419:429, ]
dfUSA <- df[2340:2350, ]
Data displayed for CANADA
head(dfCAN, n = (429-419))
## # A tibble: 10 x 25
## X__1 COUNTRY X__2 Series `1995` `1996` `1997` `1998` `1999` `2000`
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 124. CANADA <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 2 NA Inbound t~ <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 3 NA Arrivals ~ <NA> TF 16932 17286 17669 18870 19411 19627
## 4 NA Tourism e~ <NA> IMF 9176 10073 10390 11049 12024 13035
## 5 NA Travel - ~ <NA> IMF 7917 8607 8819 9414 10203 10778
## 6 NA Passenger~ <NA> IMF 1259 1466 1571 1635 1821 2257
## 7 NA Outbound ~ <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 8 NA Departure~ 1/ TF 18206 18973 19111 17648 18368 19182
## 9 NA Tourism e~ <NA> IMF 12658 13815 14151 13384 13999 15125
## 10 NA Travel - ~ <NA> IMF 10260 11254 11480 10849 11499 12438
## # ... with 15 more variables: `2001` <chr>, `2002` <chr>, `2003` <chr>,
## # `2004` <chr>, `2005` <chr>, `2006` <chr>, `2007` <chr>, `2008` <chr>,
## # `2009` <chr>, `2010` <chr>, `2011` <chr>, `2012` <chr>, `2013` <chr>,
## # `2014` <chr>, NOTES <chr>
I’m looking to filter out all the line items which are identified by the following strings under the COUNTRY column:
1. Travel - US$ Mn
2. Passenger transport - US$ Mn
3. Inbound tourism
4. Outbound tourism
dfCAN_1 <- filter(dfCAN, !COUNTRY %in% c("Travel - US$ Mn", "Passenger transport - US$ Mn", "Inbound tourism", "Outbound tourism"))
dfCAN_1
## # A tibble: 5 x 25
## X__1 COUNTRY X__2 Series `1995` `1996` `1997` `1998` `1999` `2000`
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 124. CANADA <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
## 2 NA Arrivals -~ <NA> TF 16932 17286 17669 18870 19411 19627
## 3 NA Tourism ex~ <NA> IMF 9176 10073 10390 11049 12024 13035
## 4 NA Departures~ 1/ TF 18206 18973 19111 17648 18368 19182
## 5 NA Tourism ex~ <NA> IMF 12658 13815 14151 13384 13999 15125
## # ... with 15 more variables: `2001` <chr>, `2002` <chr>, `2003` <chr>,
## # `2004` <chr>, `2005` <chr>, `2006` <chr>, `2007` <chr>, `2008` <chr>,
## # `2009` <chr>, `2010` <chr>, `2011` <chr>, `2012` <chr>, `2013` <chr>,
## # `2014` <chr>, NOTES <chr>
Then I want to gather all the numerical observations for each year under one column, “YEAR”
dfCAN_2 <- gather(dfCAN_1, "YEAR", "n", 5:(5+2014-1995))
dfCAN_2
## # A tibble: 100 x 7
## X__1 COUNTRY X__2 Series NOTES YEAR n
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 124. CANADA <NA> <NA> <NA> 1995 <NA>
## 2 NA Arrivals~ <NA> TF "Note: Different types of met~ 1995 16932
## 3 NA Tourism ~ <NA> IMF <NA> 1995 9176
## 4 NA Departur~ 1/ TF <NA> 1995 18206
## 5 NA Tourism ~ <NA> IMF <NA> 1995 12658
## 6 124. CANADA <NA> <NA> <NA> 1996 <NA>
## 7 NA Arrivals~ <NA> TF "Note: Different types of met~ 1996 17286
## 8 NA Tourism ~ <NA> IMF <NA> 1996 10073
## 9 NA Departur~ 1/ TF <NA> 1996 18973
## 10 NA Tourism ~ <NA> IMF <NA> 1996 13815
## # ... with 90 more rows
Do some cleanup by removing unwanted columns
dfCAN_3 <- dfCAN_2[,c(-1, -3:-5)]
dfCAN_3
## # A tibble: 100 x 3
## COUNTRY YEAR n
## <chr> <chr> <chr>
## 1 CANADA 1995 <NA>
## 2 Arrivals - Thousands 1995 16932
## 3 Tourism expenditure in the country - US$ Mn 1995 9176
## 4 Departures - Thousands 1995 18206
## 5 Tourism expenditure in other countries - US$ Mn 1995 12658
## 6 CANADA 1996 <NA>
## 7 Arrivals - Thousands 1996 17286
## 8 Tourism expenditure in the country - US$ Mn 1996 10073
## 9 Departures - Thousands 1996 18973
## 10 Tourism expenditure in other countries - US$ Mn 1996 13815
## # ... with 90 more rows
Next I want to use spread to turn rows into columns
dfCAN_4 <- spread(dfCAN_3, COUNTRY, n)
dfCAN_4
## # A tibble: 20 x 6
## YEAR `Arrivals - Thous~ CANADA `Departures - Th~ `Tourism expenditure~
## <chr> <chr> <chr> <chr> <chr>
## 1 1995 16932 <NA> 18206 12658
## 2 1996 17286 <NA> 18973 13815
## 3 1997 17669 <NA> 19111 14151
## 4 1998 18870 <NA> 17648 13384
## 5 1999 19411 <NA> 18368 13999
## 6 2000 19627 <NA> 19182 15125
## 7 2001 19679 <NA> 18359 14634
## 8 2002 20057 <NA> 17705 14257
## 9 2003 17534 <NA> 17739 16309
## 10 2004 19145 <NA> 19595 19267
## 11 2005 18771 <NA> 21099 22739
## 12 2006 18265 <NA> 22731 26067
## 13 2007 17935 <NA> 25163 31099
## 14 2008 17142 <NA> 27034 33844
## 15 2009 15737 <NA> 26204 30065
## 16 2010 16219 <NA> 28680 36975
## 17 2011 16014 <NA> 30450 41234
## 18 2012 16344 <NA> 32276 43010
## 19 2013 16059 <NA> 32971 43092
## 20 2014 16537 <NA> 33518 ..
## # ... with 1 more variable: `Tourism expenditure in the country - US$
## # Mn` <chr>
CANADA column gets renamed to COUNTRY
names(dfCAN_4)[which(colnames(dfUSA) == "CANADA")] <- "COUNTRY"
dfCAN_4
## # A tibble: 20 x 6
## YEAR `Arrivals - Thous~ CANADA `Departures - Th~ `Tourism expenditure~
## <chr> <chr> <chr> <chr> <chr>
## 1 1995 16932 <NA> 18206 12658
## 2 1996 17286 <NA> 18973 13815
## 3 1997 17669 <NA> 19111 14151
## 4 1998 18870 <NA> 17648 13384
## 5 1999 19411 <NA> 18368 13999
## 6 2000 19627 <NA> 19182 15125
## 7 2001 19679 <NA> 18359 14634
## 8 2002 20057 <NA> 17705 14257
## 9 2003 17534 <NA> 17739 16309
## 10 2004 19145 <NA> 19595 19267
## 11 2005 18771 <NA> 21099 22739
## 12 2006 18265 <NA> 22731 26067
## 13 2007 17935 <NA> 25163 31099
## 14 2008 17142 <NA> 27034 33844
## 15 2009 15737 <NA> 26204 30065
## 16 2010 16219 <NA> 28680 36975
## 17 2011 16014 <NA> 30450 41234
## 18 2012 16344 <NA> 32276 43010
## 19 2013 16059 <NA> 32971 43092
## 20 2014 16537 <NA> 33518 ..
## # ... with 1 more variable: `Tourism expenditure in the country - US$
## # Mn` <chr>
…and the COUNTRY column gets assinged “CANADA” value
dfCAN_4$COUNTRY <- "CANADA"
dfCAN_4
## # A tibble: 20 x 7
## YEAR `Arrivals - Thous~ CANADA `Departures - Th~ `Tourism expenditure~
## <chr> <chr> <chr> <chr> <chr>
## 1 1995 16932 <NA> 18206 12658
## 2 1996 17286 <NA> 18973 13815
## 3 1997 17669 <NA> 19111 14151
## 4 1998 18870 <NA> 17648 13384
## 5 1999 19411 <NA> 18368 13999
## 6 2000 19627 <NA> 19182 15125
## 7 2001 19679 <NA> 18359 14634
## 8 2002 20057 <NA> 17705 14257
## 9 2003 17534 <NA> 17739 16309
## 10 2004 19145 <NA> 19595 19267
## 11 2005 18771 <NA> 21099 22739
## 12 2006 18265 <NA> 22731 26067
## 13 2007 17935 <NA> 25163 31099
## 14 2008 17142 <NA> 27034 33844
## 15 2009 15737 <NA> 26204 30065
## 16 2010 16219 <NA> 28680 36975
## 17 2011 16014 <NA> 30450 41234
## 18 2012 16344 <NA> 32276 43010
## 19 2013 16059 <NA> 32971 43092
## 20 2014 16537 <NA> 33518 ..
## # ... with 2 more variables: `Tourism expenditure in the country - US$
## # Mn` <chr>, COUNTRY <chr>
Creating a final data frame for Canada data to be used in analysis
dfCanada <- data.frame(
Year = dfCAN_4$YEAR,
Country = dfCAN_4$COUNTRY,
Arrivals = dfCAN_4$`Arrivals - Thousands` %>% type.convert(na.strings = ".."),
Departures = dfCAN_4$`Departures - Thousands` %>% type.convert(na.strings = ".."),
Expenditure_In = dfCAN_4$`Tourism expenditure in the country - US$ Mn` %>% type.convert(na.strings = ".."),
Expenditure_Out = dfCAN_4$`Tourism expenditure in other countries - US$ Mn` %>% type.convert(na.strings = ".."))
dfCanada
## Year Country Arrivals Departures Expenditure_In Expenditure_Out
## 1 1995 CANADA 16932 18206 9176 12658
## 2 1996 CANADA 17286 18973 10073 13815
## 3 1997 CANADA 17669 19111 10390 14151
## 4 1998 CANADA 18870 17648 11049 13384
## 5 1999 CANADA 19411 18368 12024 13999
## 6 2000 CANADA 19627 19182 13035 15125
## 7 2001 CANADA 19679 18359 12680 14634
## 8 2002 CANADA 20057 17705 12744 14257
## 9 2003 CANADA 17534 17739 12236 16309
## 10 2004 CANADA 19145 19595 15135 19267
## 11 2005 CANADA 18771 21099 15887 22739
## 12 2006 CANADA 18265 22731 16837 26067
## 13 2007 CANADA 17935 25163 17961 31099
## 14 2008 CANADA 17142 27034 18191 33844
## 15 2009 CANADA 15737 26204 15568 30065
## 16 2010 CANADA 16219 28680 18438 36975
## 17 2011 CANADA 16014 30450 19989 41234
## 18 2012 CANADA 16344 32276 20696 43010
## 19 2013 CANADA 16059 32971 20941 43092
## 20 2014 CANADA 16537 33518 NA NA
Perform similar operations for USA data
dfUSA %<>% filter(!COUNTRY %in% c("Travel - US$ Mn", "Passenger transport - US$ Mn", "Inbound tourism", "Outbound tourism")) %>% gather("YEAR", "n", 5:(5+2014-1995))
dfUSA <- dfUSA[,c(-1, -3:-5)]
dfUSA <- spread(dfUSA, COUNTRY, n)
names(dfUSA)[which(colnames(dfUSA) == "UNITED STATES OF AMERICA")] <- "COUNTRY"
dfUSA$COUNTRY <- "USA"
dfUSA <- data.frame(
Year = dfUSA$YEAR,
Country = dfUSA$COUNTRY,
Arrivals = dfUSA$`Arrivals - Thousands` %>% type.convert(na.strings = ".."),
Departures = dfUSA$`Departures - Thousands` %>% type.convert(na.strings = ".."),
Expenditure_In = dfUSA$`Tourism expenditure in the country - US$ Mn` %>% type.convert(na.strings = ".."),
Expenditure_Out = dfUSA$`Tourism expenditure in other countries - US$ Mn` %>% type.convert(na.strings = ".."))
dfUSA
## Year Country Arrivals Departures Expenditure_In Expenditure_Out
## 1 1995 USA 43318 51285 93743 61042
## 2 1996 USA 46489 52999 102196 65477
## 3 1997 USA 47767 53229 107047 71948
## 4 1998 USA 46377 55696 105095 78423
## 5 1999 USA 48509 57222 111475 82513
## 6 2000 USA 51238 61327 120912 91473
## 7 2001 USA 46927 59442 109103 85610
## 8 2002 USA 43581 58066 104427 81860
## 9 2003 USA 41218 56250 101535 82091
## 10 2004 USA 46086 61809 115689 94764
## 11 2005 USA 49206 63503 122077 101421
## 12 2006 USA 50977 63663 126778 106848
## 13 2007 USA 56135 64049 144223 112788
## 14 2008 USA 58007 63653 164721 119838
## 15 2009 USA 55103 62051 146002 102953
## 16 2010 USA 60010 61061 167996 110049
## 17 2011 USA 62821 59209 187629 116448
## 18 2012 USA 66657 60697 200997 129902
## 19 2013 USA 69995 61874 214542 136136
## 20 2014 USA 75011 68303 220757 145678
Comparing Growth of Arrivals over the years between USA and Canada
NOTE: Using Window function cumsum and summarise function to normalize the Arrival numbers for proper [I hope] growth indication.
ggplot(
bind_rows(
mutate(dfCanada, A_Growth = cumsum(Arrivals / summarise(dfCanada, first(Arrivals))[,1] - 1)),
mutate(dfUSA, A_Growth = cumsum(Arrivals / summarise(dfUSA, first(Arrivals))[,1] - 1))
),
mapping = aes(x = Year, y = A_Growth, color = Country, shape = Country)
) +
geom_point() +
theme(axis.text.x = element_text(angle = 90))
#geom_smooth()
#geom_bar()
Comparing Growth of Departures over the years between USA and Canada
ggplot(
bind_rows(
mutate(dfCanada, A_Growth = cumsum(Departures / summarise(dfCanada, first(Departures))[,1] - 1)),
mutate(dfUSA, A_Growth = cumsum(Departures / summarise(dfUSA, first(Departures))[,1] - 1))
),
mapping = aes(x = Year, y = A_Growth, color = Country, shape = Country)
) +
geom_point() +
theme(axis.text.x = element_text(angle = 90))
Comparing Growth of Departures in Canada vs. Growth of Arrivals in USA and finding a correlation
ggplot(
bind_rows(
mutate(dfCanada, A_Growth = cumsum(Departures / summarise(dfCanada, first(Departures))[,1] - 1)),
mutate(dfUSA, A_Growth = cumsum(Arrivals / summarise(dfUSA, first(Arrivals))[,1] - 1))
),
mapping = aes(x = Year, y = A_Growth, color = Country, shape = Country)
) +
geom_point() +
theme(axis.text.x = element_text(angle = 90))
Conclusion
Based on this graph it appears that through the years there is strong indication that people traveling from Canada end up arriving to USA.