Week 8 - ggplot2 https://sites.google.com/site/cit137fall18/week-5
**Assignment**
Load tidyr, dplyr, and ggplot2 (plus scales, which supplies the comma axis-label formatter used below).
library(tidyr)
library(dplyr)
library(ggplot2)
library(scales)
#remove scientific notation/formatting
options(scipen=999)
Load data into R from link provided in slides. Check structure.
# Download the tourism-receipts CSV from the course site.
# check.names = FALSE keeps the year headers as-is ("1960", not "X1960").
# NOTE(review): skip = 4 presumably drops metadata lines above the header
# row -- confirm against the raw file.
tour <- read.csv(
  file = "https://sites.google.com/site/cit137fall18/week-5/tourism_2015.csv",
  skip = 4,
  check.names = FALSE
)
# Inspect the column types before reshaping.
str(tour)
'data.frame': 264 obs. of 62 variables:
$ Country Name : Factor w/ 264 levels "Afghanistan",..: 11 5 1 6 2 8 250 9 10 4 ...
$ Country Code : Factor w/ 264 levels "ABW","AFG","AGO",..: 1 5 2 3 4 6 7 8 9 10 ...
$ Indicator Name: Factor w/ 1 level "International tourism, receipts (current US$)": 1 1 1 1 1 1 1 1 1 1 ...
$ Indicator Code: Factor w/ 1 level "ST.INT.RCPT.CD": 1 1 1 1 1 1 1 1 1 1 ...
$ 1960 : logi NA NA NA NA NA NA ...
$ 1961 : logi NA NA NA NA NA NA ...
$ 1962 : logi NA NA NA NA NA NA ...
$ 1963 : logi NA NA NA NA NA NA ...
$ 1964 : logi NA NA NA NA NA NA ...
$ 1965 : logi NA NA NA NA NA NA ...
$ 1966 : logi NA NA NA NA NA NA ...
$ 1967 : logi NA NA NA NA NA NA ...
$ 1968 : logi NA NA NA NA NA NA ...
$ 1969 : logi NA NA NA NA NA NA ...
$ 1970 : logi NA NA NA NA NA NA ...
$ 1971 : logi NA NA NA NA NA NA ...
$ 1972 : logi NA NA NA NA NA NA ...
$ 1973 : logi NA NA NA NA NA NA ...
$ 1974 : logi NA NA NA NA NA NA ...
$ 1975 : logi NA NA NA NA NA NA ...
$ 1976 : logi NA NA NA NA NA NA ...
$ 1977 : logi NA NA NA NA NA NA ...
$ 1978 : logi NA NA NA NA NA NA ...
$ 1979 : logi NA NA NA NA NA NA ...
$ 1980 : logi NA NA NA NA NA NA ...
$ 1981 : logi NA NA NA NA NA NA ...
$ 1982 : logi NA NA NA NA NA NA ...
$ 1983 : logi NA NA NA NA NA NA ...
$ 1984 : logi NA NA NA NA NA NA ...
$ 1985 : logi NA NA NA NA NA NA ...
$ 1986 : logi NA NA NA NA NA NA ...
$ 1987 : logi NA NA NA NA NA NA ...
$ 1988 : logi NA NA NA NA NA NA ...
$ 1989 : logi NA NA NA NA NA NA ...
$ 1990 : logi NA NA NA NA NA NA ...
$ 1991 : logi NA NA NA NA NA NA ...
$ 1992 : logi NA NA NA NA NA NA ...
$ 1993 : logi NA NA NA NA NA NA ...
$ 1994 : logi NA NA NA NA NA NA ...
$ 1995 : num 554000000 NA NA 27000000 70400000 ...
$ 1996 : num 666000000 NA NA 38000000 93800000 ...
$ 1997 : num 726000000 NA NA 24000000 33600000 ...
$ 1998 : num 786000000 NA NA 39000000 60230000 ...
$ 1999 : num 782000000 NA NA 31000000 218000000 ...
$ 2000 : num 850000000 NA NA 34000000 398000000 ...
$ 2001 : num 825000000 NA NA 35000000 451000000 ...
$ 2002 : num 835000000 NA NA 51000000 492000000 ...
$ 2003 : num 858100000 NA NA 63000000 537000000 ...
$ 2004 : num 1056000000 NA NA 82000000 756000000 ...
$ 2005 : num 1097000000 NA NA 103000000 880000000 ...
$ 2006 : num 1064100000 NA NA 91000000 1057000000 ...
$ 2007 : num 1213400000 NA NA 236000000 1479000000 ...
$ 2008 : num 1352300000 NA 45000000 293000000 1848000000 ...
$ 2009 : num 1223200000 NA 87000000 554000000 2014000000 ...
$ 2010 : num 1254100000 NA 167000000 726000000 1780000000 ...
$ 2011 : num 1357600000 NA 147000000 653000000 1833000000 ...
$ 2012 : num 1412100000 NA 168000000 711000000 1623000000 ...
$ 2013 : num 1510800000 NA 154000000 1241000000 1670000000 ...
$ 2014 : num 1631900000 NA 91000000 1597000000 1849000000 ...
$ 2015 : logi NA NA NA NA NA NA ...
$ 2016 : logi NA NA NA NA NA NA ...
$ : logi NA NA NA NA NA NA ...
- Remove Country Code, Indicator Name and Indicator Code.
- Remove column 62 b/c it doesn’t have a title and is causing problems.
- Use gather() to change from wide to long.
- Change Year to factor.
- Rename Country Name as Country.
# Reshape from one column per year to one row per country-year.
tour_tidy <- tour %>%
  # Drop columns 2-4 and the untitled column 62 by position.
  select(-c(2:4, 62)) %>%
  # Stack the year columns into Year/value pairs, discarding missing values.
  gather(
    key = "Year",
    value = "Intl_Tourism_Rcpt_USD",
    `1960`:`2016`,
    na.rm = TRUE
  ) %>%
  arrange(`Country Name`, Year)
str(tour_tidy)
'data.frame': 4542 obs. of 3 variables:
$ Country Name : Factor w/ 264 levels "Afghanistan",..: 1 1 1 1 1 1 1 2 2 2 ...
$ Year : chr "2008" "2009" "2010" "2011" ...
$ Intl_Tourism_Rcpt_USD: num 45000000 87000000 167000000 147000000 168000000 154000000 91000000 70400000 93800000 33600000 ...
# Peek at the first and last rows of the reshaped data.
head(tour_tidy)
tail(tour_tidy)
# Convert to a tibble. as_tibble() replaces tbl_df(), which dplyr
# deprecated in 1.0.0.
tour_tidy <- as_tibble(tour_tidy)
str(tour_tidy)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 4542 obs. of 3 variables:
$ Country Name : Factor w/ 264 levels "Afghanistan",..: 1 1 1 1 1 1 1 2 2 2 ...
$ Year : chr "2008" "2009" "2010" "2011" ...
$ Intl_Tourism_Rcpt_USD: num 45000000 87000000 167000000 147000000 168000000 154000000 91000000 70400000 93800000 33600000 ...
# Rename `Country Name` to Country by name rather than by column position,
# so the wrong column cannot be renamed if the column order ever changes.
tour_tidy <- rename(tour_tidy, Country = `Country Name`)
names(tour_tidy)
[1] "Country" "Year" "Intl_Tourism_Rcpt_USD"
Make a geom_line() chart comparing tourism dollars over time of the three countries, using scale_y_log10() for the y-axis.
Wrap tour_tidy in a function, tourism_fun, wherein assigning countries to the function argument updates the chart.
#' Plot international tourism receipts over time for selected countries
#'
#' Filters the global `tour_tidy` data to the requested countries and draws
#' one line per country with a log10-scaled y-axis.
#'
#' @param ... Country names, given as one character vector or as separate
#'   strings; they are combined with `c()` and matched against `Country`.
#' @return A ggplot object (drawn when printed at the top level).
tourism_fun <- function(...) {
  tour_tidy %>%
    filter(Country %in% c(...)) %>%
    ggplot(aes(
      x = Year, y = Intl_Tourism_Rcpt_USD,
      colour = Country, group = Country
    )) +
    geom_line() +
    # The axis title is the scale's `name`. labs() builds a plot-level
    # component meant to be added with `+`, so it must not be passed into
    # the scale call.
    scale_y_log10(name = "Intl Tourism, Current USD", labels = comma)
}
#str(tourism_fun)
class(tourism_fun)
[1] "function"
tourism_fun
function(...) {
tour_tidy %>%
filter(Country %in% c(...)) %>%
ggplot(aes(x = Year, y = Intl_Tourism_Rcpt_USD, col = Country, group = Country)) +
geom_line() +
scale_y_log10(labs(y = "Intl Tourism, Current USD"), labels = comma)
}
# Collect the countries to compare in a character vector
country_choose <- c("Netherlands", "Moldova", "Luxembourg", "Kyrgyz Republic")
# Plot them: the vector is passed through "..." into the country filter
tourism_fun(country_choose)
# "..." accepts any number of arguments, so a different set of countries
# can be plotted without changing the function
tourism_countries <- c("Zimbabwe", "Togo", "Brazil", "Peru")
# Run the function again with the new vector
tourism_fun(tourism_countries)