DATA 607 Project2_1 Homework

Global annual GDP growth rates (%) by country, 1960 - 2020

Data Reference Link: data.worldbank.org

Let’s load the required libraries in R for data analysis
library(dplyr)
library(downloader)
library(stringr)
library(htmlTable)
library(tidyverse)


Download and read the CSV file to a dataframe object
View the data
# Import the GDP growth data from the CSV file.
# The file starts with a UTF-8 byte-order mark; without "UTF-8-BOM" the mark is
# decoded into the first header, which then shows up as `ï..Country.Name`.
# With it, the first column is read as `Country.Name`.
# NOTE(review): the trailing `X` column is NA in every row shown by head();
# it presumably comes from a trailing comma in the CSV - confirm.
gdp_data <- read.csv(
  "https://raw.githubusercontent.com/baruab/msdsrepo/main/DATA-607/GDP_by_country.csv",
  stringsAsFactors = FALSE,
  fileEncoding = "UTF-8-BOM"
)

# Preview the first five rows
head(gdp_data, 5)  
##               ï..Country.Name Country.Code        Indicator.Name
## 1                       Aruba          ABW GDP growth (annual %)
## 2 Africa Eastern and Southern          AFE GDP growth (annual %)
## 3                 Afghanistan          AFG GDP growth (annual %)
## 4  Africa Western and Central          AFW GDP growth (annual %)
## 5                      Angola          AGO GDP growth (annual %)
##      Indicator.Code X1960    X1961    X1962    X1963    X1964    X1965
## 1 NY.GDP.MKTP.KD.ZG    NA       NA       NA       NA       NA       NA
## 2 NY.GDP.MKTP.KD.ZG    NA 1.063696 7.453563 5.740520 5.473950 5.594137
## 3 NY.GDP.MKTP.KD.ZG    NA       NA       NA       NA       NA       NA
## 4 NY.GDP.MKTP.KD.ZG    NA 1.898596 3.816073 7.040888 5.233236 4.175162
## 5 NY.GDP.MKTP.KD.ZG    NA       NA       NA       NA       NA       NA
##       X1966     X1967    X1968     X1969     X1970     X1971    X1972    X1973
## 1        NA        NA       NA        NA        NA        NA       NA       NA
## 2  4.058715  5.813018 4.046609  5.178724  4.855967  5.100963 2.203884 4.458723
## 3        NA        NA       NA        NA        NA        NA       NA       NA
## 4 -1.796436 -9.401674 1.422819 15.107822 17.618959 10.628708 3.212971 4.100759
## 5        NA        NA       NA        NA        NA        NA       NA       NA
##       X1974     X1975    X1976     X1977     X1978    X1979    X1980     X1981
## 1        NA        NA       NA        NA        NA       NA       NA        NA
## 2  5.839322  1.421272 2.444660 0.7431271  1.647541 3.174390 5.707294  4.181481
## 3        NA        NA       NA        NA        NA       NA       NA        NA
## 4 10.533280 -1.908240 8.773445 4.3362697 -2.555202 5.192982 2.278913 -6.635271
## 5        NA        NA       NA        NA        NA       NA       NA -4.400001
##        X1982      X1983    X1984      X1985    X1986     X1987     X1988
## 1         NA         NA       NA         NA       NA 16.078431 18.648649
## 2  0.2044245 -0.1678771 3.599962 -0.3106704 1.801654  3.626758  4.244825
## 3         NA         NA       NA         NA       NA        NA        NA
## 4 -3.0906407 -6.1737644 0.766286  5.4873474 1.338868  1.205109  4.906503
## 5  0.0000000  4.2000014 6.000002  3.4999995 2.900002  4.082749  6.128890
##         X1989       X1990       X1991     X1992       X1993      X1994
## 1 12.12984055  3.96140173  7.96287250  5.882353   7.3076923  8.2039028
## 2  2.64672228  0.05297704 -0.08690589 -2.155483  -0.6660328  2.0872612
## 3          NA          NA          NA        NA          NA         NA
## 4  2.32247485  6.43720769  1.22080653  2.683972  -1.1609721 -0.2260965
## 5  0.04162146 -3.45009868  0.99135930 -5.838281 -23.9834174  1.3393634
##       X1995     X1996    X1997    X1998    X1999    X2000     X2001     X2002
## 1  2.547144  1.185788 7.046874 1.991986 1.238042 7.616588 -2.971257 -3.273646
## 2  4.308948  5.410609 3.433427 1.657682 2.672356 3.407952  3.385073  4.077465
## 3        NA        NA       NA       NA       NA       NA        NA        NA
## 4  2.011852  4.596463 3.828704 3.606729 1.403042 3.611657  5.667418  9.930416
## 5 15.000000 13.544370 7.274277 4.691146 2.181490 3.054624  4.205999 13.665687
##      X2003     X2004     X2005     X2006     X2007       X2008       X2009
## 1 1.975547  7.911563  1.214349  1.050608  1.800226 -0.09070805 -10.5197485
## 2 3.156648  5.423484  6.312341  6.832111  7.104249  4.79681982   1.0394013
## 3 8.832278  1.414118 11.229715  5.357403 13.826320  3.92498382  21.3905284
## 4 5.873041  8.017312  6.005428  5.257805  5.588151  6.17543276   6.1454385
## 5 2.989850 10.952862 15.028915 11.547683 14.010018 11.16613833   0.8587126
##       X2010     X2011     X2012    X2013    X2014     X2015        X2016
## 1 -3.685029 3.4460548 -1.369863 4.198232 0.300000 5.7000009  2.099999586
## 2  4.809783 4.2019919  3.240976 4.470306 4.090918 2.7632428  2.004538890
## 3 14.362441 0.4263548 12.752287 5.600745 2.724543 1.4513147  2.260314201
## 4  6.641963 5.0048745  5.272612 5.831383 5.833539 2.7351660 -0.001994532
## 5  4.403933 3.4719763  8.542188 4.954545 4.822628 0.9435716 -2.580049644
##        X2017     X2018      X2019      X2020  X
## 1  1.9999991        NA         NA         NA NA
## 2  2.8322876  2.385829  1.6729245 -3.5751725 NA
## 3  2.6470032  1.189228  3.9116034 -1.9347782 NA
## 4  2.1637475  2.831539  3.1474817 -0.9789222 NA
## 5 -0.1472129 -2.003630 -0.6246443 -4.0405100 NA
Round the numeric values to two decimal places and remove unwanted columns
# Round every numeric column (the yearly growth figures) to two decimals.
gdp_data2 <- gdp_data %>%
  mutate(across(where(is.numeric), function(value) round(value, digits = 2)))

# Drop the two indicator columns, which hold the same indicator for every row
# in the preview and are not needed for the analysis.
subset_gdp_data <- gdp_data2 %>%
  select(-c(Indicator.Name, Indicator.Code))
Remove the letter ‘X’ from the column names
Rename the first column of the data frame
# read.csv() prefixes the year headers with "X" (X1960, X1961, ...).
# Strip only a leading "X" that is followed by a digit: an unanchored
# gsub("X", "") would also delete any other capital X in a column name and
# would rename the trailing `X` column to the empty string.
names(subset_gdp_data) <- sub(
  "^X(?=[0-9])", "", names(subset_gdp_data),
  perl = TRUE
)

# The first column holds the country name; give it the same name as the key
# column of the latitude/longitude table so the two can be joined on it.
names(subset_gdp_data)[1] <- "country"
Load latitude/longitude data
# Import the country latitude/longitude data from the CSV file.
# stringsAsFactors = FALSE matches the GDP import and keeps `country` a
# character column on R < 4.0, so it can be joined against the GDP table's
# character key without factor coercion.
country_latlng_data <- read.csv(
  "https://raw.githubusercontent.com/baruab/msdsrepo/main/DATA-607/country_latitude_longitude.csv",
  stringsAsFactors = FALSE
)

# Preview the first rows. The usa_state_* columns sit alongside the country
# columns but are not used in this analysis.
head(country_latlng_data)
##   country_code latitude  longitude              country usa_state_code
## 1           AD 42.54624   1.601554              Andorra             AK
## 2           AE 23.42408  53.847818 United Arab Emirates             AL
## 3           AF 33.93911  67.709953          Afghanistan             AR
## 4           AG 17.06082 -61.796428  Antigua and Barbuda             AZ
## 5           AI 18.22055 -63.068615             Anguilla             CA
## 6           AL 41.15333  20.168331              Albania             CO
##   usa_state_latitude usa_state_longitude  usa_state
## 1           63.58875          -154.49306     Alaska
## 2           32.31823           -86.90230    Alabama
## 3           35.20105           -91.83183   Arkansas
## 4           34.04893          -111.09373    Arizona
## 5           36.77826          -119.41793 California
## 6           39.55005          -105.78207   Colorado
Select relevant columns from the country geo dataset
Re-order the columns in the dataframe
# Keep only the three country-level columns of the geo table.
subset_latlng_data <- country_latlng_data[, c("latitude", "longitude", "country")]

# Put the join key first, followed by the coordinates.
col_order <- c("country", "latitude", "longitude")
new_latlng_data <- subset_latlng_data[, col_order]
Join the latitude/longitude with GDP columns in the dataframe
# Attach the GDP growth columns to each country in the geo table. This is a
# left join, so every geo row is kept and countries without a name match in
# the GDP table get NA in the GDP columns.
joined_df <- new_latlng_data %>%
  left_join(subset_gdp_data, by = "country")

Select a few columns to work with

Add up the annual GDP growth rates for the last 5 years (2016 - 2020) and store the sum in a new column, total

Filter out the countries that have not reported their GDP growth for every one of those years

### Selecting few columns

# Keep the country, its coordinates and the 2016-2020 columns, then add
# `total` as the sum of the five yearly values (annual GDP growth, in %).
# The year columns are referenced from the piped data itself (backticks are
# needed because the names start with a digit) rather than via `joined_df$...`,
# so the sum stays row-aligned even if rows are filtered or reordered earlier
# in the pipeline.
sel_df <- joined_df %>%
  select(
    country, `2016`, `2017`, `2018`, `2019`, `2020`,
    latitude, longitude
  ) %>%
  mutate(total = `2016` + `2017` + `2018` + `2019` + `2020`) %>%
  # A missing value in any of the five years makes the sum NA; drop those rows.
  filter(!is.na(total))

Sort the countries by their total GDP growth

Display the 10 top-performing countries for 2016 - 2020

# Rank the countries from the highest to the lowest five-year total and keep
# the first ten rows.
top_10_countries <- head(arrange(sel_df, desc(total)), n = 10)

# Render the ranking as an HTML table.
knitr::kable(top_10_countries, format = "html")
country 2016 2017 2018 2019 2020 latitude longitude total
Guyana 3.81 3.73 4.44 5.35 43.48 4.860416 -58.930180 60.81
Ethiopia 9.43 9.56 6.82 8.36 6.06 9.145000 40.489673 40.23
Guinea 10.82 10.30 6.36 5.65 6.99 9.945587 -9.696645 40.12
Tajikistan 6.90 7.10 7.60 7.40 4.50 38.861034 71.276093 33.50
Bangladesh 7.11 7.28 7.86 8.15 2.38 23.684994 90.356331 32.78
Vietnam 6.21 6.81 7.08 7.02 2.91 14.058324 108.277199 30.03
China 6.85 6.95 6.75 5.95 2.30 35.861660 104.195397 28.80
Djibouti 6.65 5.40 8.41 7.77 0.50 11.825138 42.590275 28.73
Ireland 1.99 9.13 8.52 5.57 3.42 53.412910 -8.243890 28.63
Tanzania 6.87 6.79 5.44 5.79 2.00 -6.369028 34.888822 26.89

Plot a bar graph of the top countries

# Bar chart (one bar per country, bar height = `total`); this is not a
# histogram, since the heights are the values themselves rather than counts.
# geom_col() is the direct equivalent of geom_bar(stat = "identity").
ggplot(top_10_countries, aes(x = country, y = total)) +
  geom_col() +
  # Rotate the country names so they do not overlap
  scale_x_discrete(guide = guide_axis(angle = 90)) +
  # `total` is the sum of the 2016-2020 yearly values; label the axis
  # accordingly instead of showing the bare column name
  labs(x = "country", y = "total GDP growth, 2016-2020 (%)")

Mark the top countries on the world map

# Natural Earth map data and its accessor functions
library(rnaturalearth)
library(rnaturalearthdata)

# Medium-scale country polygons, returned as an sf object so that geom_sf()
# can draw them.
world <- ne_countries(returnclass = "sf", scale = "medium")

  
  # World base map with each top-10 country marked by a red point and labelled
  # with its name at the latitude/longitude taken from the geo table.
  ggplot(data = world) +
    geom_sf() +
    geom_point(
      data = top_10_countries,
      aes(x = longitude, y = latitude),
      color = "#e60000"
    ) +
    # check_overlap drops labels that would be drawn on top of an earlier one
    geom_text(
      data = top_10_countries,
      aes(x = longitude, y = latitude, label = country),
      color = "darkblue", fontface = "bold", size = 3, check_overlap = TRUE
    ) +
    # No fill aesthetic is mapped in this plot, so the former
    # scale_fill_manual() call had no effect and has been removed.
    labs(
      title = 'Countries with highest GDP',
      subtitle = "Sum of annual GDP growth (%), 2016 - 2020"
    ) +
    theme(
      # Dark text: the previous white (#FFFFFF) text was effectively invisible
      # against the near-white #FFFFF4 background.
      text = element_text(family = "Arial", color = "#333333"),
      panel.background = element_rect(fill = "#FFFFF4"),
      plot.background = element_rect(fill = "#FFFFF4"),
      panel.grid = element_blank(),
      plot.title = element_text(size = 30),
      plot.subtitle = element_text(size = 10),
      # Hide the coordinate axes; they add nothing to a locator map
      axis.text = element_blank(),
      axis.title = element_blank(),
      axis.ticks = element_blank(),
      legend.position = "none"
    )