library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.6 ✓ dplyr 1.0.4
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Part 1 1. If necessary, set your working directory. Read the tourism table and create the cleaned_tourism table.
Tour <-read_rds("tourism.rds")
Continent <- read_rds("country_info.rds")
head(Tour)
## # A tibble: 6 x 23
## A COUNTRY Series `_1995` `_1996` `_1997` `_1998` `_1999` `_2000` `_2001`
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 4 AFGHAN… "" "" "" "" "" "" "" ""
## 2 NA Inboun… "" "" "" "" "" "" "" ""
## 3 NA Arriva… ".." ".." ".." ".." ".." ".." ".." ".."
## 4 NA Touris… "IMF" ".." ".." ".." ".." ".." ".." ".."
## 5 NA Travel… "IMF" ".." ".." ".." ".." ".." ".." ".."
## 6 NA Passen… "IMF" ".." ".." ".." ".." ".." ".." ".."
## # … with 13 more variables: `_2002` <chr>, `_2003` <chr>, `_2004` <chr>,
## # `_2005` <chr>, `_2006` <chr>, `_2007` <chr>, `_2008` <chr>, `_2009` <chr>,
## # `_2010` <chr>, `_2011` <chr>, `_2012` <chr>, `_2013` <chr>, `_2014` <chr>
- Remove the columns _1995 through _2013.
df <- select(Tour, c(A, COUNTRY, Series, "_2014"))
head(df)
## # A tibble: 6 x 4
## A COUNTRY Series `_2014`
## <dbl> <chr> <chr> <chr>
## 1 4 AFGHANISTAN "" ""
## 2 NA Inbound tourism "" ""
## 3 NA Arrivals - Thousands ".." ".."
## 4 NA Tourism expenditure in the country - US$ Mn "IMF" "91"
## 5 NA Travel - US$ Mn "IMF" "82"
## 6 NA Passenger transport - US$ Mn "IMF" "9"
- Create the Country_Name and Tourism_Type columns from values in the Country column. Valid values for Tourism_Type are Inbound tourism and Outbound tourism. Remove rows that contain this labeling information and no other data.
df <- df %>% mutate(COUNTRY_NAME = ifelse(!is.na(A), COUNTRY, NA)) %>%
fill(COUNTRY_NAME) %>%
filter(is.na(A))
head(df)
## # A tibble: 6 x 5
## A COUNTRY Series `_2014` COUNTRY_NAME
## <dbl> <chr> <chr> <chr> <chr>
## 1 NA Inbound tourism "" "" AFGHANISTAN
## 2 NA Arrivals - Thousands ".." ".." AFGHANISTAN
## 3 NA Tourism expenditure in the country - US$ Mn "IMF" "91" AFGHANISTAN
## 4 NA Travel - US$ Mn "IMF" "82" AFGHANISTAN
## 5 NA Passenger transport - US$ Mn "IMF" "9" AFGHANISTAN
## 6 NA Outbound tourism "" "" AFGHANISTAN
##Part 2 > 4. In the Series column, convert values to uppercase and convert “..” or " " to missing value.
df <- df %>%
mutate(Series = toupper(Series)) %>% #Turnings Everything Upper case
mutate(Series = na_if(Series, ".."), #Replaces ".." with NA
Series = na_if(Series, "")) #Replaces "" with NA
- Determine the conversion type (Mn or Thousands) that will be used to calculate values for the new Y2014 column. Hint: You might want to create a new column with this information.
df <- df %>%
mutate(M_or_T = case_when(grepl("Mn", COUNTRY) ~ 1000000,
grepl("Thousand", COUNTRY) ~ 1000))
- In the _2014 column, change the data not available (values of “..” or " ") to a missing value.
df <- df %>%
mutate(Y2014 = as.numeric(`_2014`) * M_or_T)
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
##Part 3
- Create the new Category column from values in the Country column and change the original values to the following valid values: • Arrivals • Departures • Passenger Transport - US$ • Tourism expenditure in other countries - US$ • Tourism expenditure in the country – US$ • Travel - US$
df <- df %>%
mutate(CATEGORY = case_when(COUNTRY == "Arrivals - Thousands" ~ "Arrivals",
COUNTRY == "Departures - Thousands" ~ "Departures",
COUNTRY == "Passenger transport - US$ Mn" ~ "Passenger transport - US$",
COUNTRY == "Tourism expenditure in other countries - US$ Mn" ~ "Tourism expenditure in other countries - US$",
COUNTRY == "Tourism expenditure in the country - US$ Mn" ~ "Tourism expenditure in the country - US$",
COUNTRY == "Travel - US$ Mn" ~ "Travel - US$"))
- Include only Country_Name, Tourism_Type, Category, Series, and Y2014 in the output table.
df <- df %>%
select(-A, -COUNTRY, -`_2014`, -M_or_T)
##Create the Final_Tourism Table
- Create a factor variable for the Continent column that labels continent IDs with the corresponding continent names: 1 = North America 2 = South America 3 = Europe 4 = Africa 5 = Asia 6 = Oceania 7 = Antarctica
head(Continent)
## # A tibble: 6 x 2
## Continent Country
## <dbl> <chr>
## 1 3 ANDORRA
## 2 5 UNITED ARAB EMIRATES
## 3 5 AFGHANISTAN
## 4 1 ANTIGUA AND BARBUDA
## 5 1 ANGUILLA
## 6 3 ALBANIA
- Join the cleaned_tourism table with country_info to create the final_tourism table. Include only matches in the output table. Use the new factor labels for Continent.
Continent <- Continent %>%
mutate(Continent = case_when(Continent == 1 ~ "North America",
Continent == 2 ~ "South America",
Continent == 3 ~ "Europe",
Continent == 4 ~ "Africa",
Continent == 5 ~ "Asia",
Continent == 6 ~ "Oceania",
Continent == 7 ~ "Antarctica"))
final_tourism <- merge(df, Continent, by.x= "COUNTRY_NAME", by.y = "Country") %>%
arrange("COUNTRY_NAME","TOURISM_TYPE")
head(final_tourism)
## COUNTRY_NAME Series Y2014 CATEGORY
## 1 AFGHANISTAN IMF 1.22e+08 Tourism expenditure in other countries - US$
## 2 AFGHANISTAN <NA> NA <NA>
## 3 AFGHANISTAN <NA> NA Departures
## 4 AFGHANISTAN IMF 1.11e+08 Travel - US$
## 5 AFGHANISTAN <NA> NA <NA>
## 6 AFGHANISTAN IMF 1.10e+07 Passenger transport - US$
## Continent
## 1 Asia
## 2 Asia
## 3 Asia
## 4 Asia
## 5 Asia
## 6 Asia
##Create the NoCountryFound Table > Create the NoCountryFound table that has a list of countries from Cleaned_Tourism that are not found in the country_info table. This table should have only one row for each country.
logical<- Continent$Country %in% unique(final_tourism$COUNTRY_NAME)
NoCountryFound <- Continent %>%
filter(!logical)
NoCountryFound
## # A tibble: 41 x 2
## Continent Country
## <chr> <chr>
## 1 Antarctica ANTARCTICA
## 2 North America SAINT BARTHELEMY
## 3 North America BONAIRE, SAINT EUSTATIUS AND SABA
## 4 Antarctica BOUVET ISLAND
## 5 Asia COCOS (KEELING) ISLANDS
## 6 Africa CONGO, THE DEMOCRATIC REPUBLIC OF THE
## 7 Africa CAMEROON/NIGERIA
## 8 Asia CHINA/INDIA
## 9 Asia CHINA/TAIWAN, PROVINCE OF CHINA
## 10 Africa CAPE VERDE
## # … with 31 more rows