WK6_Nation Dataset homework

library(tidyverse)  # I just noticed tidyverse is needed everywhere!!

## Warning: package 'tidyverse' was built under R version 4.0.5

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1

## Warning: package 'ggplot2' was built under R version 4.0.5

## Warning: package 'tibble' was built under R version 4.0.5

## Warning: package 'tidyr' was built under R version 4.0.5

## Warning: package 'readr' was built under R version 4.0.5

## Warning: package 'dplyr' was built under R version 4.0.5

## Warning: package 'forcats' was built under R version 4.0.5

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Load the Nations dataset. The full path is built with file.path() rather
# than calling setwd(), so the session's working directory is left untouched.
# NOTE(review): the directory is still machine-specific; confirm where
# nations.csv lives before running this on another computer.
nations <- read_csv(
  file.path("C:/Users/Ma Family/Documents/R/DATA110/week6", "nations.csv")
)

## Rows: 5275 Columns: 10
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): iso2c, iso3c, country, region, income
## dbl (5): year, gdp_percap, population, birth_rate, neonat_mortal_rate
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

✽ Use dplyr and ggplot2 to process data and draw these two charts from the Nations dataset.

library(dplyr)
library(ggplot2)

✽ For both charts, you will first need to create a new variable in the data, using mutate from dplyr, giving the GDP of each country in trillions of dollars, by multiplying gdp_percap by population and dividing by a trillion.

# Add a `gdp` column: per-capita GDP times population, divided by 10^12
# so the value is expressed in trillions.
nations <- mutate(nations, gdp = gdp_percap * population / 10^12)
nations

## # A tibble: 5,275 x 11
##    iso2c iso3c country  year gdp_percap population birth_rate neonat_mortal_rate
##    <chr> <chr> <chr>   <dbl>      <dbl>      <dbl>      <dbl>              <dbl>
##  1 AD    AND   Andorra  1996         NA      64291       10.9                2.8
##  2 AD    AND   Andorra  1994         NA      62707       10.9                3.2
##  3 AD    AND   Andorra  2003         NA      74783       10.3                2  
##  4 AD    AND   Andorra  1990         NA      54511       11.9                4.3
##  5 AD    AND   Andorra  2009         NA      85474        9.9                1.7
##  6 AD    AND   Andorra  2011         NA      82326       NA                  1.6
##  7 AD    AND   Andorra  2004         NA      78337       10.9                2  
##  8 AD    AND   Andorra  2010         NA      84419        9.8                1.7
##  9 AD    AND   Andorra  2001         NA      67770       11.8                2.1
## 10 AD    AND   Andorra  2002         NA      71046       11.2                2.1
## # ... with 5,265 more rows, and 3 more variables: region <chr>, income <chr>,
## #   gdp <dbl>

# names(nations)

⇨ Next, I select three East Asian countries (China, Japan, and South Korea) plus the United States.

#a.usa <- nations[grep("tates", nations$country), ]      # find a string including particular letters in data frame.

# Keep only the four countries compared in the first chart. `%in%` tests
# membership in a single call instead of chaining `==` comparisons with `|`.
four <- nations %>%
  filter(country %in% c("Korea, Rep.", "China", "Japan", "United States"))
# Count rows per country to confirm that all four names matched.
table(four$country)

## 
##         China         Japan   Korea, Rep. United States 
##            25            25            25            25

# List the column names of the four-country subset.
colnames(four)

##  [1] "iso2c"              "iso3c"              "country"           
##  [4] "year"               "gdp_percap"         "population"        
##  [7] "birth_rate"         "neonat_mortal_rate" "region"            
## [10] "income"             "gdp"

⇨ I checked if there are NAs in the data frame.

# Count missing values across every column of `four`. The generic is.na() is
# called so that R dispatches to the appropriate method for the object's class,
# rather than hard-coding the data.frame method.
sum(is.na(four))

## [1] 0

✽ Draw both charts with ggplot2.

✽ For the first chart, you will need to filter the data with dplyr for the four desired countries. When making the chart with ggplot2, you will need to add both geom_point and geom_line layers, and use the Set1 ColorBrewer palette using: scale_color_brewer(palette = "Set1").

⇨ First, create a new dataframe grouped by country

# Group the four-country rows by country and display the grouped tibble.
four_gdp <- group_by(four, country)
four_gdp

## # A tibble: 100 x 11
## # Groups:   country [4]
##    iso2c iso3c country  year gdp_percap population birth_rate neonat_mortal_rate
##    <chr> <chr> <chr>   <dbl>      <dbl>      <dbl>      <dbl>              <dbl>
##  1 CN    CHN   China    1992      1260. 1164970000       18.3               29.4
##  2 CN    CHN   China    2005      5053. 1303720000       12.4               14  
##  3 CN    CHN   China    2000      2915. 1262645000       14.0               21.2
##  4 CN    CHN   China    1991      1091. 1150780000       19.7               29.7
##  5 CN    CHN   China    2013     12219. 1357380000       12.1                6.3
##  6 CN    CHN   China    1999      2650. 1252735000       14.6               22.2
##  7 CN    CHN   China    2014     13255. 1364270000       12.4                5.9
##  8 CN    CHN   China    2003      3934. 1288400000       12.4               17.1
##  9 CN    CHN   China    2004      4423. 1296075000       12.3               15.5
## 10 CN    CHN   China    1993      1453. 1178440000       18.1               28.8
## # ... with 90 more rows, and 3 more variables: region <chr>, income <chr>,
## #   gdp <dbl>

⇨ Second, make an interactive scatter plot with connecting lines using ggplot() and ggplotly()

library(plotly)

## Warning: package 'plotly' was built under R version 4.0.5

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

# Line-and-point chart of total GDP by year, one colour per country.
p <- ggplot(four_gdp, aes(x = year, y = gdp, color = country)) +
  geom_line() +
  geom_point() +
  scale_color_brewer(palette = "Set1") +
  # Title and axis labels.
  labs(
    title = "China's Rise to Become the Largest Economy",
    x = "Year",
    y = "GDP ($trillion)"
  ) +
  # Sizes and colours for the title, the axis titles, and the axis tick labels.
  theme(
    plot.title = element_text(size = 18, color = "steelblue", lineheight = 1.2),
    axis.title.x = element_text(size = 13, color = "salmon"),
    axis.title.y = element_text(size = 13, color = "salmon"),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10)
  )

# Render the ggplot object as an interactive plotly chart.
ggplotly(p)

# Show each of the four countries' GDP per capita for 2014.
four %>%
  filter(year == 2014) %>%
  select(country, gdp_percap) %>%
  head()

## # A tibble: 4 x 2
##   country       gdp_percap
##   <chr>              <dbl>
## 1 China             13255.
## 2 Japan             36577.
## 3 Korea, Rep.       33417.
## 4 United States     54398.

⇨ Korea’s total GDP looks like it has increased very little over these 25 years. However, Korea’s GDP per capita has increased a lot: it is now around 2.5 times China’s and is approaching Japan’s. Because Korea’s population is relatively small, its total GDP remains low on this chart, and Korea’s economy relies largely on other countries’ economies, especially China’s.

✽ For the second chart, using dplyr you will need to group_by region and year, and then summarize on your mutated value for gdp using summarise(GDP = sum(gdp, na.rm = TRUE)). (There will be null values, or NAs, in this data, so you will need to use na.rm = TRUE).

# Sum GDP (in trillions) for every region-year pair; na.rm = TRUE skips
# countries whose gdp is NA instead of turning the whole sum into NA.
regions <- summarise(
  group_by(nations, region, year),
  GDP = sum(gdp, na.rm = TRUE)
)

## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.

# Print the region-by-year GDP totals to check the summary.
regions

## # A tibble: 175 x 3
## # Groups:   region [7]
##    region               year   GDP
##    <chr>               <dbl> <dbl>
##  1 East Asia & Pacific  1990  5.52
##  2 East Asia & Pacific  1991  6.03
##  3 East Asia & Pacific  1992  6.50
##  4 East Asia & Pacific  1993  7.04
##  5 East Asia & Pacific  1994  7.64
##  6 East Asia & Pacific  1995  8.29
##  7 East Asia & Pacific  1996  8.96
##  8 East Asia & Pacific  1997  9.55
##  9 East Asia & Pacific  1998  9.60
## 10 East Asia & Pacific  1999 10.1 
## # ... with 165 more rows

✽ Each region’s area will be generated by the command geom_area().

✽ When drawing the chart with ggplot2, you will need to use the Set2 ColorBrewer palette using scale_fill_brewer(palette = "Set2").

✽ Think about the difference between fill and color when making the chart, and where the above fill command needs to go in order for the regions to fill with the different colors when making the chart, and put a very thin white line around each area.

# Area chart of regional GDP over time, using the Set2 ColorBrewer palette.
ggplot(regions, aes(x = year, y = GDP, fill = region)) +
  # `fill` (mapped above) colours each region's area; `color` here sets only
  # the thin white outline drawn around each area.
  geom_area(alpha = 0.75, size = 0.2, color = "white") +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "GDP by World Bank Region", y = "GDP ($trillion)")

⇨ Let’s try to apply a different themed appearance to the chart.

library(viridis)

## Warning: package 'viridis' was built under R version 4.0.5

## Loading required package: viridisLite

## Warning: package 'viridisLite' was built under R version 4.0.5

library(hrbrthemes)

## Warning: package 'hrbrthemes' was built under R version 4.0.5

## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.

##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and

##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow

# Same regional area chart, restyled with the viridis fill scale and the
# hrbrthemes `theme_ipsum()` theme.
ggplot(data = regions, aes(x = year, y = GDP, fill = region)) +
  geom_area(color = "white", alpha = 0.6, size = 0.5) +
  # `region` is categorical, so request the discrete viridis scale. Spell out
  # TRUE: the shorthand `T` is an ordinary variable that can be reassigned.
  scale_fill_viridis(discrete = TRUE) +
  theme_ipsum() +
  labs(title = "GDP by World Bank Region", y = "GDP($trillion)")

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database