knitr::opts_chunk$set(echo = TRUE)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)

urlfile="https://raw.githubusercontent.com/Nhodgkinson/DATA-607-P2/main/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_4413598.csv"

gdpdata<-read_csv(url(urlfile))
## Rows: 266 Columns: 66
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Country Name, Country Code, Indicator Name, Indicator Code
## dbl (62): 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gdpdata<-subset(gdpdata, select=-c(3,4)) #Removing columns indicator name and indicator code as we do not need it 

Tidying

The way the data is currently presented is in a “wide” format. Each year has it’s own column and currently the data has over 60 columns. We want to view the data as “Country Name, Country Code, Year, GDP”. The current problem with the layout is that some countries in the dataset don’t exist during certain years. So the country row populates with a name and then a gdp only for the years it exists, the other values are “NA”. When reordering the data in a tall format I will be able to simply remove the NA values from the year variable.

Soviet Union, USSR, 1991, 2333030 /Soviet Union, USSR, 1992, NA/(Removed)

gdpdata #Let's look at how the data looks before tidying. As stated before, each column contains a year and value. This reduces the amount of rows but spreads it out over a widen area
## # A tibble: 266 × 64
##    `Country Name`  Count…¹   `1960`   `1961`   `1962`   `1963`   `1964`   `1965`
##    <chr>           <chr>      <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
##  1 Aruba           ABW     NA       NA       NA       NA       NA       NA      
##  2 Africa Eastern… AFE      2.13e10  2.18e10  2.37e10  2.82e10  2.61e10  2.97e10
##  3 Afghanistan     AFG      5.38e 8  5.49e 8  5.47e 8  7.51e 8  8.00e 8  1.01e 9
##  4 Africa Western… AFW      1.04e10  1.11e10  1.19e10  1.27e10  1.38e10  1.49e10
##  5 Angola          AGO     NA       NA       NA       NA       NA       NA      
##  6 Albania         ALB     NA       NA       NA       NA       NA       NA      
##  7 Andorra         AND     NA       NA       NA       NA       NA       NA      
##  8 Arab World      ARB     NA       NA       NA       NA       NA       NA      
##  9 United Arab Em… ARE     NA       NA       NA       NA       NA       NA      
## 10 Argentina       ARG     NA       NA        2.45e10  1.83e10  2.56e10  2.83e10
## # … with 256 more rows, 56 more variables: `1966` <dbl>, `1967` <dbl>,
## #   `1968` <dbl>, `1969` <dbl>, `1970` <dbl>, `1971` <dbl>, `1972` <dbl>,
## #   `1973` <dbl>, `1974` <dbl>, `1975` <dbl>, `1976` <dbl>, `1977` <dbl>,
## #   `1978` <dbl>, `1979` <dbl>, `1980` <dbl>, `1981` <dbl>, `1982` <dbl>,
## #   `1983` <dbl>, `1984` <dbl>, `1985` <dbl>, `1986` <dbl>, `1987` <dbl>,
## #   `1988` <dbl>, `1989` <dbl>, `1990` <dbl>, `1991` <dbl>, `1992` <dbl>,
## #   `1993` <dbl>, `1994` <dbl>, `1995` <dbl>, `1996` <dbl>, `1997` <dbl>, …
gdpdf<-gather(gdpdata, "Year", "GDP", 3:64) #Here I gather columns 3-64 into one column named "year" and the values of 3-64 as a new column called "GDP"

gdpdf<-gdpdf[!is.na(gdpdf$GDP),] #I'm removing all NA values from the GDP column. They add no useful information and bloat the data

gdpdf<- gdpdf %>%
  arrange(`Country Name`)  #arranges the data by country name, A-Z
  
options(scipen = 100)#removes scientific notation from GDP so we see true values

gdpdf #Now the data has the 4 desired columns, we don't have unnecessary NA values, and we can look at the analysis ask
## # A tibble: 13,118 × 4
##    `Country Name` `Country Code` Year          GDP
##    <chr>          <chr>          <chr>       <dbl>
##  1 Afghanistan    AFG            1960   537777811.
##  2 Afghanistan    AFG            1961   548888896.
##  3 Afghanistan    AFG            1962   546666678.
##  4 Afghanistan    AFG            1963   751111191.
##  5 Afghanistan    AFG            1964   800000044.
##  6 Afghanistan    AFG            1965  1006666638 
##  7 Afghanistan    AFG            1966  1399999967 
##  8 Afghanistan    AFG            1967  1673333418 
##  9 Afghanistan    AFG            1968  1373333367 
## 10 Afghanistan    AFG            1969  1408888922 
## # … with 13,108 more rows

Analysis

Benjamin, the original poster of this dataset in the weekly discussion post, wanted to see an analysis that looked at the % of GDP change by country year over year. Now we have the data arranged and tidied we can start to do that analysis

gdpdf<-gdpdf %>%
  group_by(`Country Name`) %>%
  arrange(`Year`, .by_group = TRUE)%>%
  mutate(`Percent Change` = (GDP/lag(GDP)-1)*100)
gdpdf
## # A tibble: 13,118 × 5
## # Groups:   Country Name [262]
##    `Country Name` `Country Code` Year          GDP `Percent Change`
##    <chr>          <chr>          <chr>       <dbl>            <dbl>
##  1 Afghanistan    AFG            1960   537777811.           NA    
##  2 Afghanistan    AFG            1961   548888896.            2.07 
##  3 Afghanistan    AFG            1962   546666678.           -0.405
##  4 Afghanistan    AFG            1963   751111191.           37.4  
##  5 Afghanistan    AFG            1964   800000044.            6.51 
##  6 Afghanistan    AFG            1965  1006666638            25.8  
##  7 Afghanistan    AFG            1966  1399999967            39.1  
##  8 Afghanistan    AFG            1967  1673333418            19.5  
##  9 Afghanistan    AFG            1968  1373333367           -17.9  
## 10 Afghanistan    AFG            1969  1408888922             2.59 
## # … with 13,108 more rows
#I created a formula using "lag". Percent Change is row GDP value divided by lag(GDP)-1, or above row GDP value, multiplied by 100 to give me the percent of change year over year.