library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.6     ✓ dplyr   1.0.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Create the Cleaned_Tourism Table

Part 1 1. If necessary, set your working directory. Read the tourism table and create the cleaned_tourism table.

Tour <-read_rds("tourism.rds")
Continent <- read_rds("country_info.rds")
head(Tour) 
## # A tibble: 6 x 23
##       A COUNTRY Series `_1995` `_1996` `_1997` `_1998` `_1999` `_2000` `_2001`
##   <dbl> <chr>   <chr>  <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>  
## 1     4 AFGHAN… ""     ""      ""      ""      ""      ""      ""      ""     
## 2    NA Inboun… ""     ""      ""      ""      ""      ""      ""      ""     
## 3    NA Arriva… ".."   ".."    ".."    ".."    ".."    ".."    ".."    ".."   
## 4    NA Touris… "IMF"  ".."    ".."    ".."    ".."    ".."    ".."    ".."   
## 5    NA Travel… "IMF"  ".."    ".."    ".."    ".."    ".."    ".."    ".."   
## 6    NA Passen… "IMF"  ".."    ".."    ".."    ".."    ".."    ".."    ".."   
## # … with 13 more variables: `_2002` <chr>, `_2003` <chr>, `_2004` <chr>,
## #   `_2005` <chr>, `_2006` <chr>, `_2007` <chr>, `_2008` <chr>, `_2009` <chr>,
## #   `_2010` <chr>, `_2011` <chr>, `_2012` <chr>, `_2013` <chr>, `_2014` <chr>
  1. Remove the columns _1995 through _2013.
df <- select(Tour, c(A, COUNTRY, Series, "_2014"))

head(df)
## # A tibble: 6 x 4
##       A COUNTRY                                     Series `_2014`
##   <dbl> <chr>                                       <chr>  <chr>  
## 1     4 AFGHANISTAN                                 ""     ""     
## 2    NA Inbound tourism                             ""     ""     
## 3    NA Arrivals - Thousands                        ".."   ".."   
## 4    NA Tourism expenditure in the country - US$ Mn "IMF"  "91"   
## 5    NA Travel - US$ Mn                             "IMF"  "82"   
## 6    NA Passenger transport - US$ Mn                "IMF"  "9"
  1. Create the Country_Name and Tourism_Type columns from values in the Country column. Valid values for Tourism_Type are Inbound tourism and Outbound tourism. Remove rows that contain this labeling information and no other data.
df <- df %>% mutate(COUNTRY_NAME = ifelse(!is.na(A), COUNTRY, NA)) %>% 
  fill(COUNTRY_NAME) %>%
  filter(is.na(A))
head(df)
## # A tibble: 6 x 5
##       A COUNTRY                                     Series `_2014` COUNTRY_NAME
##   <dbl> <chr>                                       <chr>  <chr>   <chr>       
## 1    NA Inbound tourism                             ""     ""      AFGHANISTAN 
## 2    NA Arrivals - Thousands                        ".."   ".."    AFGHANISTAN 
## 3    NA Tourism expenditure in the country - US$ Mn "IMF"  "91"    AFGHANISTAN 
## 4    NA Travel - US$ Mn                             "IMF"  "82"    AFGHANISTAN 
## 5    NA Passenger transport - US$ Mn                "IMF"  "9"     AFGHANISTAN 
## 6    NA Outbound tourism                            ""     ""      AFGHANISTAN

##Part 2 > 4. In the Series column, convert values to uppercase and convert “..” or " " to missing value.

df <- df %>% 
  mutate(Series = toupper(Series)) %>% #Turnings Everything Upper case
  mutate(Series = na_if(Series, ".."), #Replaces ".." with NA
         Series = na_if(Series, "")) #Replaces "" with NA
  1. Determine the conversion type (Mn or Thousands) that will be used to calculate values for the new Y2014 column. Hint: You might want to create a new column with this information.
df <- df %>% 
  mutate(M_or_T = case_when(grepl("Mn", COUNTRY) ~ 1000000, 
                            grepl("Thousand", COUNTRY) ~ 1000)) 
  1. In the _2014 column, change the data not available (values of “..” or " ") to a missing value.
df <- df %>% 
  mutate(Y2014 = as.numeric(`_2014`) * M_or_T)
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion

##Part 3

  1. Create the new Category column from values in the Country column and change the original values to the following valid values: • Arrivals • Departures • Passenger Transport - US$ • Tourism expenditure in other countries - US$ • Tourism expenditure in the country – US$ • Travel - US$
df <- df %>% 
  mutate(CATEGORY = case_when(COUNTRY == "Arrivals - Thousands" ~ "Arrivals",
                              COUNTRY == "Departures - Thousands" ~ "Departures",
                              COUNTRY == "Passenger transport - US$ Mn" ~ "Passenger transport - US$",
                              COUNTRY == "Tourism expenditure in other countries - US$ Mn" ~ "Tourism expenditure in other countries - US$",
                              COUNTRY == "Tourism expenditure in the country - US$ Mn" ~ "Tourism expenditure in the country - US$",
                              COUNTRY == "Travel - US$ Mn" ~ "Travel - US$"))
  1. Include only Country_Name, Tourism_Type, Category, Series, and Y2014 in the output table.
df <- df %>% 
  select(-A, -COUNTRY, -`_2014`, -M_or_T)

##Create the Final_Tourism Table

  1. Create a factor variable for the Continent column that labels continent IDs with the corresponding continent names: 1 = North America 2 = South America 3 = Europe 4 = Africa 5 = Asia 6 = Oceania 7 = Antarctica
head(Continent)
## # A tibble: 6 x 2
##   Continent Country             
##       <dbl> <chr>               
## 1         3 ANDORRA             
## 2         5 UNITED ARAB EMIRATES
## 3         5 AFGHANISTAN         
## 4         1 ANTIGUA AND BARBUDA 
## 5         1 ANGUILLA            
## 6         3 ALBANIA
  1. Join the cleaned_tourism table with country_info to create the final_tourism table. Include only matches in the output table. Use the new factor labels for Continent.
Continent <- Continent %>% 
  mutate(Continent = case_when(Continent == 1 ~ "North America",
                               Continent == 2 ~ "South America",
                               Continent == 3 ~ "Europe",
                               Continent == 4 ~ "Africa",
                               Continent == 5 ~ "Asia",
                               Continent == 6 ~ "Oceania",
                               Continent == 7 ~ "Antarctica"))
final_tourism <- merge(df, Continent, by.x= "COUNTRY_NAME", by.y = "Country") %>%
  arrange("COUNTRY_NAME","TOURISM_TYPE")
  

head(final_tourism)
##   COUNTRY_NAME Series    Y2014                                     CATEGORY
## 1  AFGHANISTAN    IMF 1.22e+08 Tourism expenditure in other countries - US$
## 2  AFGHANISTAN   <NA>       NA                                         <NA>
## 3  AFGHANISTAN   <NA>       NA                                   Departures
## 4  AFGHANISTAN    IMF 1.11e+08                                 Travel - US$
## 5  AFGHANISTAN   <NA>       NA                                         <NA>
## 6  AFGHANISTAN    IMF 1.10e+07                    Passenger transport - US$
##   Continent
## 1      Asia
## 2      Asia
## 3      Asia
## 4      Asia
## 5      Asia
## 6      Asia

##Create the NoCountryFound Table > Create the NoCountryFound table that has a list of countries from Cleaned_Tourism that are not found in the country_info table. This table should have only one row for each country.

logical<- Continent$Country %in% unique(final_tourism$COUNTRY_NAME) 
NoCountryFound <- Continent %>% 
  filter(!logical)
NoCountryFound
## # A tibble: 41 x 2
##    Continent     Country                              
##    <chr>         <chr>                                
##  1 Antarctica    ANTARCTICA                           
##  2 North America SAINT BARTHELEMY                     
##  3 North America BONAIRE, SAINT EUSTATIUS AND SABA    
##  4 Antarctica    BOUVET ISLAND                        
##  5 Asia          COCOS (KEELING) ISLANDS              
##  6 Africa        CONGO, THE DEMOCRATIC REPUBLIC OF THE
##  7 Africa        CAMEROON/NIGERIA                     
##  8 Asia          CHINA/INDIA                          
##  9 Asia          CHINA/TAIWAN, PROVINCE OF CHINA      
## 10 Africa        CAPE VERDE                           
## # … with 31 more rows