knitr::opts_chunk$set(echo = TRUE)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
urlfile="https://raw.githubusercontent.com/Nhodgkinson/DATA-607-P2/main/API_NY.GDP.MKTP.CD_DS2_en_csv_v2_4413598.csv"
gdpdata<-read_csv(url(urlfile))
## Rows: 266 Columns: 66
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Country Name, Country Code, Indicator Name, Indicator Code
## dbl (62): 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gdpdata<-subset(gdpdata, select=-c(3,4)) #Removing columns indicator name and indicator code as we do not need it
The way the data is currently presented is in a “wide” format. Each year has it’s own column and currently the data has over 60 columns. We want to view the data as “Country Name, Country Code, Year, GDP”. The current problem with the layout is that some countries in the dataset don’t exist during certain years. So the country row populates with a name and then a gdp only for the years it exists, the other values are “NA”. When reordering the data in a tall format I will be able to simply remove the NA values from the year variable.
Soviet Union, USSR, 1991, 2333030 /Soviet Union, USSR, 1992, NA/(Removed)
gdpdata #Let's look at how the data looks before tidying. As stated before, each column contains a year and value. This reduces the amount of rows but spreads it out over a widen area
## # A tibble: 266 × 64
## `Country Name` Count…¹ `1960` `1961` `1962` `1963` `1964` `1965`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Aruba ABW NA NA NA NA NA NA
## 2 Africa Eastern… AFE 2.13e10 2.18e10 2.37e10 2.82e10 2.61e10 2.97e10
## 3 Afghanistan AFG 5.38e 8 5.49e 8 5.47e 8 7.51e 8 8.00e 8 1.01e 9
## 4 Africa Western… AFW 1.04e10 1.11e10 1.19e10 1.27e10 1.38e10 1.49e10
## 5 Angola AGO NA NA NA NA NA NA
## 6 Albania ALB NA NA NA NA NA NA
## 7 Andorra AND NA NA NA NA NA NA
## 8 Arab World ARB NA NA NA NA NA NA
## 9 United Arab Em… ARE NA NA NA NA NA NA
## 10 Argentina ARG NA NA 2.45e10 1.83e10 2.56e10 2.83e10
## # … with 256 more rows, 56 more variables: `1966` <dbl>, `1967` <dbl>,
## # `1968` <dbl>, `1969` <dbl>, `1970` <dbl>, `1971` <dbl>, `1972` <dbl>,
## # `1973` <dbl>, `1974` <dbl>, `1975` <dbl>, `1976` <dbl>, `1977` <dbl>,
## # `1978` <dbl>, `1979` <dbl>, `1980` <dbl>, `1981` <dbl>, `1982` <dbl>,
## # `1983` <dbl>, `1984` <dbl>, `1985` <dbl>, `1986` <dbl>, `1987` <dbl>,
## # `1988` <dbl>, `1989` <dbl>, `1990` <dbl>, `1991` <dbl>, `1992` <dbl>,
## # `1993` <dbl>, `1994` <dbl>, `1995` <dbl>, `1996` <dbl>, `1997` <dbl>, …
gdpdf<-gather(gdpdata, "Year", "GDP", 3:64) #Here I gather columns 3-64 into one column named "year" and the values of 3-64 as a new column called "GDP"
gdpdf<-gdpdf[!is.na(gdpdf$GDP),] #I'm removing all NA values from the GDP column. They add no useful information and bloat the data
gdpdf<- gdpdf %>%
arrange(`Country Name`) #arranges the data by country name, A-Z
options(scipen = 100)#removes scientific notation from GDP so we see true values
gdpdf #Now the data has the 4 desired columns, we don't have unnecessary NA values, and we can look at the analysis ask
## # A tibble: 13,118 × 4
## `Country Name` `Country Code` Year GDP
## <chr> <chr> <chr> <dbl>
## 1 Afghanistan AFG 1960 537777811.
## 2 Afghanistan AFG 1961 548888896.
## 3 Afghanistan AFG 1962 546666678.
## 4 Afghanistan AFG 1963 751111191.
## 5 Afghanistan AFG 1964 800000044.
## 6 Afghanistan AFG 1965 1006666638
## 7 Afghanistan AFG 1966 1399999967
## 8 Afghanistan AFG 1967 1673333418
## 9 Afghanistan AFG 1968 1373333367
## 10 Afghanistan AFG 1969 1408888922
## # … with 13,108 more rows
Benjamin, the original poster of this dataset in the weekly discussion post, wanted to see an analysis that looked at the % of GDP change by country year over year. Now we have the data arranged and tidied we can start to do that analysis
gdpdf<-gdpdf %>%
group_by(`Country Name`) %>%
arrange(`Year`, .by_group = TRUE)%>%
mutate(`Percent Change` = (GDP/lag(GDP)-1)*100)
gdpdf
## # A tibble: 13,118 × 5
## # Groups: Country Name [262]
## `Country Name` `Country Code` Year GDP `Percent Change`
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Afghanistan AFG 1960 537777811. NA
## 2 Afghanistan AFG 1961 548888896. 2.07
## 3 Afghanistan AFG 1962 546666678. -0.405
## 4 Afghanistan AFG 1963 751111191. 37.4
## 5 Afghanistan AFG 1964 800000044. 6.51
## 6 Afghanistan AFG 1965 1006666638 25.8
## 7 Afghanistan AFG 1966 1399999967 39.1
## 8 Afghanistan AFG 1967 1673333418 19.5
## 9 Afghanistan AFG 1968 1373333367 -17.9
## 10 Afghanistan AFG 1969 1408888922 2.59
## # … with 13,108 more rows
#I created a formula using "lag". Percent Change is row GDP value divided by lag(GDP)-1, or above row GDP value, multiplied by 100 to give me the percent of change year over year.