Nations HW

Author

Walter Hinkley

Details for Nations Dataset Charts Assignment

# Set the working directory, load the tidyverse, and read the Nations dataset

# NOTE(review): hard-coded path under the user's home directory; the relative
# file name passed to read_csv() is resolved against this directory.
setwd("~/Desktop/Data Science 110")
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
# Read the Nations CSV (resolved relative to the working directory) into a tibble.
nations <- read_csv(file = "nations.csv")
Rows: 5275 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): iso2c, iso3c, country, region, income
dbl (5): year, gdp_percap, population, birth_rate, neonat_mortal_rate

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Subset the four nations of interest into separate data frames

# One data frame per country of interest.
# filter() keeps only rows where the comparison is TRUE; the previous
# `nations[nations$country == "...", ]` form would also return an all-NA row
# for every row whose country is missing.
ndf1 <- filter(nations, country == "United States")
ndf2 <- filter(nations, country == "Italy")
ndf3 <- filter(nations, country == "Australia")
ndf4 <- filter(nations, country == "France")

#Merge list of Nations into 1 data frame

# Collect the per-country data frames and fold them into one tibble.
# full_join() with no `by` joins on every shared column, so rows from each
# country that have no match in the accumulated result are appended.
# The stray empty trailing argument in `reduce(full_join,)` has been removed.
ndf_list <- list(ndf1, ndf2, ndf3, ndf4)
ndf_list1 <- reduce(ndf_list, full_join)
Joining with `by = join_by(iso2c, iso3c, country, year, gdp_percap, population,
birth_rate, neonat_mortal_rate, region, income)`
Joining with `by = join_by(iso2c, iso3c, country, year, gdp_percap, population,
birth_rate, neonat_mortal_rate, region, income)`
Joining with `by = join_by(iso2c, iso3c, country, year, gdp_percap, population,
birth_rate, neonat_mortal_rate, region, income)`
ndf_list1 # print the combined tibble for inspection
# A tibble: 100 × 10
   iso2c iso3c country  year gdp_percap population birth_rate neonat_mortal_rate
   <chr> <chr> <chr>   <dbl>      <dbl>      <dbl>      <dbl>              <dbl>
 1 US    USA   United…  2001     37274.  284968955       14.1                4.6
 2 US    USA   United…  2008     48401.  304093966       14                  4.3
 3 US    USA   United…  2002     38166.  287625193       14                  4.6
 4 US    USA   United…  1999     34621.  279040000       14.2                4.6
 5 US    USA   United…  2009     47002.  306771529       13.5                4.2
 6 US    USA   United…  2007     48062.  301231207       14.3                4.3
 7 US    USA   United…  2003     39677.  290107933       14.1                4.6
 8 US    USA   United…  2000     36450.  282162411       14.4                4.6
 9 US    USA   United…  1998     32949.  275854000       14.3                4.7
10 US    USA   United…  1996     30068.  269394000       14.4                4.9
# ℹ 90 more rows
# ℹ 2 more variables: region <chr>, income <chr>

# Add a new variable to ndf_list1: total GDP in trillions of dollars (gdp_percap * population / 10^12)

# Total GDP = per-capita GDP times population, scaled by 10^12 so that
# gdp_dollars is expressed in trillions.
ndf_list2 <- mutate(
  ndf_list1,
  gdp_dollars = (gdp_percap * population) / 10^12
)
ndf_list2
# A tibble: 100 × 11
   iso2c iso3c country  year gdp_percap population birth_rate neonat_mortal_rate
   <chr> <chr> <chr>   <dbl>      <dbl>      <dbl>      <dbl>              <dbl>
 1 US    USA   United…  2001     37274.  284968955       14.1                4.6
 2 US    USA   United…  2008     48401.  304093966       14                  4.3
 3 US    USA   United…  2002     38166.  287625193       14                  4.6
 4 US    USA   United…  1999     34621.  279040000       14.2                4.6
 5 US    USA   United…  2009     47002.  306771529       13.5                4.2
 6 US    USA   United…  2007     48062.  301231207       14.3                4.3
 7 US    USA   United…  2003     39677.  290107933       14.1                4.6
 8 US    USA   United…  2000     36450.  282162411       14.4                4.6
 9 US    USA   United…  1998     32949.  275854000       14.3                4.7
10 US    USA   United…  1996     30068.  269394000       14.4                4.9
# ℹ 90 more rows
# ℹ 3 more variables: region <chr>, income <chr>, gdp_dollars <dbl>

#Create Visualization #1 of GDP Wine Countries

# Line-and-point chart of GDP (in trillions) over time, one colored series
# per country.
ndf_chart <- ggplot(
  data = ndf_list2,
  mapping = aes(x = year, y = gdp_dollars, group = country, color = country)
) +
  geom_line() +
  geom_point() +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "GDP of Popular Wine producing Countries",
    x = "Year",
    y = "GDP in Trillions of $"
  )

ndf_chart

# Compute GDP in trillions for every nation, then sum it by region and year, ignoring NAs

# Add total GDP in trillions (per-capita GDP times population, divided by
# 10^12) to the full nations table.
natsb <- mutate(
  nations,
  gdp_dollars = (gdp_percap * population) / 10^12
)
natsb
# A tibble: 5,275 × 11
   iso2c iso3c country  year gdp_percap population birth_rate neonat_mortal_rate
   <chr> <chr> <chr>   <dbl>      <dbl>      <dbl>      <dbl>              <dbl>
 1 AD    AND   Andorra  1996         NA      64291       10.9                2.8
 2 AD    AND   Andorra  1994         NA      62707       10.9                3.2
 3 AD    AND   Andorra  2003         NA      74783       10.3                2  
 4 AD    AND   Andorra  1990         NA      54511       11.9                4.3
 5 AD    AND   Andorra  2009         NA      85474        9.9                1.7
 6 AD    AND   Andorra  2011         NA      82326       NA                  1.6
 7 AD    AND   Andorra  2004         NA      78337       10.9                2  
 8 AD    AND   Andorra  2010         NA      84419        9.8                1.7
 9 AD    AND   Andorra  2001         NA      67770       11.8                2.1
10 AD    AND   Andorra  2002         NA      71046       11.2                2.1
# ℹ 5,265 more rows
# ℹ 3 more variables: region <chr>, income <chr>, gdp_dollars <dbl>
# Sum GDP within each region/year pair; na.rm = TRUE skips nations whose GDP
# could not be computed. The result remains grouped by region.
natsc <- summarise(
  group_by(natsb, region, year),
  sum_GDP = sum(gdp_dollars, na.rm = TRUE)
)
`summarise()` has grouped output by 'region'. You can override using the
`.groups` argument.
natsc # print the summed GDP per region and year
# A tibble: 175 × 3
# Groups:   region [7]
   region               year sum_GDP
   <chr>               <dbl>   <dbl>
 1 East Asia & Pacific  1990    5.52
 2 East Asia & Pacific  1991    6.03
 3 East Asia & Pacific  1992    6.50
 4 East Asia & Pacific  1993    7.04
 5 East Asia & Pacific  1994    7.64
 6 East Asia & Pacific  1995    8.29
 7 East Asia & Pacific  1996    8.96
 8 East Asia & Pacific  1997    9.55
 9 East Asia & Pacific  1998    9.60
10 East Asia & Pacific  1999   10.1 
# ℹ 165 more rows

# Plot natsc as an area chart: summed GDP by year, filled by region

# Area chart of summed GDP per year, with one filled band per region and thin
# white outlines between bands.
ggplot(
  data = natsc,
  mapping = aes(x = year, y = sum_GDP, fill = region)
) +
  geom_area(color = "white", linewidth = 0.2, alpha = 0.6) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "GDP by Regions",
    x = "Year",
    y = "GDP in Trillions of $",
    caption = "World Bank Data"
  )