About The Project

This project identifies the five countries with the highest average annual GDP growth and the five countries with the lowest. It also fits a linear model to predict the world-average GDP growth for a given year.

Read Data from Github

# Download the GDP dataset. The first four lines of the file are skipped so
# that the row holding the column names is read as the header.
gdp_url <- "https://raw.githubusercontent.com/nnaemeka-git/global-datasets/main/GDP%20dataset.csv"
df <- read.csv(gdp_url, skip = 4)
# Compact overview of every column and its type.
glimpse(df)
## Rows: 266
## Columns: 65
## $ Country.Name   <chr> "Aruba", "Africa Eastern and Southern", "Afghanistan", ~
## $ Country.Code   <chr> "ABW", "AFE", "AFG", "AFW", "AGO", "ALB", "AND", "ARB",~
## $ Indicator.Name <chr> "GDP growth (annual %)", "GDP growth (annual %)", "GDP ~
## $ Indicator.Code <chr> "NY.GDP.MKTP.KD.ZG", "NY.GDP.MKTP.KD.ZG", "NY.GDP.MKTP.~
## $ X1960          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ X1961          <dbl> NA, 1.063696, NA, 1.898596, NA, NA, NA, NA, NA, 5.42784~
## $ X1962          <dbl> NA, 7.4535627, NA, 3.8160733, NA, NA, NA, NA, NA, -0.85~
## $ X1963          <dbl> NA, 5.7405204, NA, 7.0408881, NA, NA, NA, NA, NA, -5.30~
## $ X1964          <dbl> NA, 5.473950, NA, 5.233236, NA, NA, NA, NA, NA, 10.1302~
## $ X1965          <dbl> NA, 5.594137, NA, 4.175162, NA, NA, NA, NA, NA, 10.5694~
## $ X1966          <dbl> NA, 4.0587148, NA, -1.7964361, NA, NA, NA, NA, NA, -0.6~
## $ X1967          <dbl> NA, 5.813018, NA, -9.401674, NA, NA, NA, NA, NA, 3.1919~
## $ X1968          <dbl> NA, 4.0466086, NA, 1.4228191, NA, NA, NA, NA, NA, 4.822~
## $ X1969          <dbl> NA, 5.178724, NA, 15.107822, NA, NA, NA, NA, NA, 9.6795~
## $ X1970          <dbl> NA, 4.8559668, NA, 17.6189590, NA, NA, NA, NA, NA, 3.04~
## $ X1971          <dbl> NA, 5.100963, NA, 10.628708, NA, NA, 4.649465, NA, NA, ~
## $ X1972          <dbl> NA, 2.203884, NA, 3.212971, NA, NA, 8.149743, NA, NA, 1~
## $ X1973          <dbl> NA, 4.4587227, NA, 4.1007589, NA, NA, 7.7884672, NA, NA~
## $ X1974          <dbl> NA, 5.8393223, NA, 10.5332804, NA, NA, 5.6187897, NA, N~
## $ X1975          <dbl> NA, 1.4212719, NA, -1.9082403, NA, NA, 0.5422057, NA, N~
## $ X1976          <dbl> NA, 2.4446599, NA, 8.7734448, NA, NA, 3.3037870, 15.743~
## $ X1977          <dbl> NA, 0.7431271, NA, 4.3362697, NA, NA, 2.8385756, 8.2368~
## $ X1978          <dbl> NA, 1.6475414, NA, -2.5552025, NA, NA, 1.4630002, -0.69~
## $ X1979          <dbl> NA, 3.17439019, NA, 5.19298181, NA, NA, 0.04155719, 11.~
## $ X1980          <dbl> NA, 5.7072943, NA, 2.2789134, NA, NA, 2.2087276, 9.0629~
## $ X1981          <dbl> NA, 4.1814809, NA, -6.6352715, -4.4000012, 5.7456353, -~
## $ X1982          <dbl> NA, 0.20442452, NA, -3.09064074, 0.00000000, 2.94859680~
## $ X1983          <dbl> NA, -0.1678771, NA, -6.1737644, 4.2000014, 1.1049383, 1~
## $ X1984          <dbl> NA, 3.59996156, NA, 0.76628601, 6.00000216, -1.25159665~
## $ X1985          <dbl> NA, -0.3106704, NA, 5.4873474, 3.4999995, 1.7806440, 2.~
## $ X1986          <dbl> NA, 1.801654, NA, 1.338868, 2.900002, 5.637243, 3.25332~
## $ X1987          <dbl> 16.0784314, 3.6267585, NA, 1.2051095, 4.0827486, -0.787~
## $ X1988          <dbl> 18.648649, 4.244825, NA, 4.906503, 6.128890, -1.420040,~
## $ X1989          <dbl> 12.12984055, 2.64672228, NA, 2.32247485, 0.04162146, 9.~
## $ X1990          <dbl> 3.96140173, 0.05297704, NA, 6.43720769, -3.45009868, -9~
## $ X1991          <dbl> 7.96287250, -0.08690589, NA, 1.22080653, 0.99135930, -2~
## $ X1992          <dbl> 5.8823529, -2.1554827, NA, 2.6839716, -5.8382807, -7.18~
## $ X1993          <dbl> 7.3076923, -0.6660327, NA, -1.1609721, -23.9834174, 9.5~
## $ X1994          <dbl> 8.2039028, 2.0872612, NA, -0.2260965, 1.3393634, 8.3028~
## $ X1995          <dbl> 2.547144, 4.308948, NA, 2.011852, 15.000000, 13.322333,~
## $ X1996          <dbl> 1.185788, 5.410609, NA, 4.596463, 13.544370, 9.099999, ~
## $ X1997          <dbl> 7.046874, 3.433427, NA, 3.828704, 7.274277, -10.919984,~
## $ X1998          <dbl> 1.9919859, 1.6576824, NA, 3.6067293, 4.6911465, 8.83008~
## $ X1999          <dbl> 1.238042, 2.672356, NA, 1.403042, 2.181490, 12.889897, ~
## $ X2000          <dbl> 7.6165882, 3.4079519, NA, 3.6116575, 3.0546242, 6.95003~
## $ X2001          <dbl> -2.971257, 3.385073, NA, 5.667418, 4.205999, 8.290070, ~
## $ X2002          <dbl> -3.2736464, 4.0774651, NA, 9.9304160, 13.6656865, 4.539~
## $ X2003          <dbl> 1.9755473, 3.1566477, 8.8322778, 5.8730414, 2.9898500, ~
## $ X2004          <dbl> 7.9115635, 5.4234838, 1.4141180, 8.0173116, 10.9528618,~
## $ X2005          <dbl> 1.2143493, 6.3123412, 11.2297148, 6.0054279, 15.0289153~
## $ X2006          <dbl> 1.050608, 6.832111, 5.357403, 5.257805, 11.547683, 5.90~
## $ X2007          <dbl> 1.800226, 7.104249, 13.826320, 5.588151, 14.010018, 5.9~
## $ X2008          <dbl> -0.09070805, 4.79681982, 3.92498382, 6.17543276, 11.166~
## $ X2009          <dbl> -10.5197485, 1.0394013, 21.3905284, 6.1454385, 0.858712~
## $ X2010          <dbl> -3.6850294, 4.8097831, 14.3624415, 6.6419635, 4.4039325~
## $ X2011          <dbl> 3.446054750, 4.201991866, 0.426354785, 5.004874530, 3.4~
## $ X2012          <dbl> -1.3698630, 3.2409757, 12.7522871, 5.2726115, 8.5421876~
## $ X2013          <dbl> 4.19823232, 4.47030560, 5.60074466, 5.83138307, 4.95454~
## $ X2014          <dbl> 0.3000000, 4.0909182, 2.7245434, 5.8335387, 4.8226276, ~
## $ X2015          <dbl> 5.7000009, 2.7632428, 1.4513147, 2.7351660, 0.9435716, ~
## $ X2016          <dbl> 2.099999586, 2.004538890, 2.260314201, -0.001994532, -2~
## $ X2017          <dbl> 1.9999991, 2.8322876, 2.6470032, 2.1637475, -0.1472129,~
## $ X2018          <dbl> NA, 2.385829, 1.189228, 2.831539, -2.003630, 4.071301, ~
## $ X2019          <dbl> NA, 1.6729245, 3.9116034, 3.1474817, -0.6246443, 2.1736~
## $ X2020          <dbl> NA, -3.5751725, -1.9347782, -0.9789222, -4.0405100, -3.~

Transform dataset with pivot longer

# Reshape from wide (one column per year) to long (one row per
# country-year), leaving the four identifier columns untouched.
df_long <- pivot_longer(
  df,
  cols = !c(Country.Name, Country.Code, Indicator.Name, Indicator.Code),
  names_to = "Year",
  values_to = "GDP"
)

# Preview the first 15 rows of the reshaped data.
df_long %>% head(n = 15)
## # A tibble: 15 x 6
##    Country.Name Country.Code Indicator.Name        Indicator.Code    Year    GDP
##    <chr>        <chr>        <chr>                 <chr>             <chr> <dbl>
##  1 Aruba        ABW          GDP growth (annual %) NY.GDP.MKTP.KD.ZG X1960    NA
##  2 Aruba        ABW          GDP growth (annual %) NY.GDP.MKTP.KD.ZG X1961    NA
##  3 Aruba        ABW          GDP growth (annual %) NY.GDP.MKTP.KD.ZG X1962    NA
##  4 Aruba        ABW          GDP growth (annual %) NY.GDP.MKTP.KD.ZG X1963    NA
##  5 Aruba        ABW          GDP growth (annual %) NY.GDP.MKTP.KD.ZG X1964    NA
##  6 Aruba        ABW          GDP growth (annual %) NY.GDP.MKTP.KD.ZG X1965    NA
##  7 Aruba        ABW          GDP growth (annual %) NY.GDP.MKTP.KD.ZG X1966    NA
##  8 Aruba        ABW          GDP growth (annual %) NY.GDP.MKTP.KD.ZG X1967    NA
##  9 Aruba        ABW          GDP growth (annual %) NY.GDP.MKTP.KD.ZG X1968    NA
## 10 Aruba        ABW          GDP growth (annual %) NY.GDP.MKTP.KD.ZG X1969    NA
## 11 Aruba        ABW          GDP growth (annual %) NY.GDP.MKTP.KD.ZG X1970    NA
## 12 Aruba        ABW          GDP growth (annual %) NY.GDP.MKTP.KD.ZG X1971    NA
## 13 Aruba        ABW          GDP growth (annual %) NY.GDP.MKTP.KD.ZG X1972    NA
## 14 Aruba        ABW          GDP growth (annual %) NY.GDP.MKTP.KD.ZG X1973    NA
## 15 Aruba        ABW          GDP growth (annual %) NY.GDP.MKTP.KD.ZG X1974    NA

Get Year

# Convert the year labels (e.g. "X1960") to numbers by removing every
# non-digit character. gsub() returns exactly one value per input element,
# so the result always has the same length as the column it replaces; the
# previous pattern "\\d+.." matched any two trailing characters and went
# through unlist(), which does not guarantee that.
df_long$Year <- as.numeric(gsub("[^0-9]", "", df_long$Year))

Top 5 GDP

# Average GDP value per Country.Name across all years (rounded to two
# decimals), together with the number of years that actually have a value.
# Rows are ordered from highest to lowest average, and names whose average
# could not be computed (no data in any year) are removed.
df_sel_val <- df_long %>%
  group_by(Country.Name) %>%
  summarise(
    Avg_GDP = round(mean(GDP, na.rm = TRUE), 2),
    Count = sum(!is.na(GDP))
  ) %>%
  arrange(desc(Avg_GDP)) %>%
  filter(!is.na(Avg_GDP))

# Top 5: the first five rows of the sorted table
top_5_gdp <- df_sel_val[seq_len(5), ]
top_5_gdp
## # A tibble: 5 x 3
##   Country.Name           Avg_GDP Count
##   <chr>                    <dbl> <int>
## 1 Equatorial Guinea        13.7     40
## 2 Bosnia and Herzegovina    8.97    26
## 3 Qatar                     8.43    20
## 4 Oman                      8.38    54
## 5 China                     8.07    60
# Bar chart of the five highest averages. The indicator in this dataset is
# "GDP growth (annual %)", so the axis and title describe the values as
# yearly percentages rather than dollar amounts.
top_5_gdp %>%
  ggplot(aes(reorder(Country.Name, Avg_GDP), Avg_GDP)) +
  geom_col(fill = "#8CD71A") +
  geom_text(aes(label = Avg_GDP), color = "blue") +
  labs(
    x = "Country",
    y = "Average GDP growth (annual %)",
    title = "Equatorial Guinea has the highest average GDP growth of 13.67% per year followed by Bosnia\n and Herzegovina with an average of 8.97% per year"
  )

Least 5 GDP

# df_sel_val is sorted from highest to lowest average, so its last five
# rows hold the five lowest averages.
least_5_gdp <- df_sel_val %>% tail(n = 5)
least_5_gdp
## # A tibble: 5 x 3
##   Country.Name             Avg_GDP Count
##   <chr>                      <dbl> <int>
## 1 American Samoa             -0.97    17
## 2 Ukraine                    -1.04    33
## 3 Virgin Islands (U.S.)      -1.15    16
## 4 Northern Mariana Islands   -1.8     17
## 5 South Sudan                -4.93     7
# Bar chart of the five lowest averages. The indicator in this dataset is
# "GDP growth (annual %)", so the axis and title describe the values as
# yearly percentages rather than dollar amounts.
least_5_gdp %>%
  ggplot(aes(reorder(Country.Name, Avg_GDP), Avg_GDP)) +
  geom_col(fill = "#D77E1A") +
  geom_text(aes(label = Avg_GDP), color = "blue") +
  labs(
    x = "Country",
    y = "Average GDP growth (annual %)",
    title = "South Sudan has the lowest average GDP growth, shrinking by\n approximately 4.9% per year. Northern Mariana Islands is the second\n lowest in the list of 5 lowest average GDP growth rates"
  )

World Average GDP Growth by Year

## # A tibble: 60 x 3
##     Year Avg_GDP CounOfGDP
##    <dbl>   <dbl>     <int>
##  1  1970    7.3        141
##  2  1969    6.79       144
##  3  1976    6.26       160
##  4  1964    6.22       128
##  5  1968    6.14       141
##  6  2004    5.97       251
##  7  2006    5.83       252
##  8  2007    5.76       253
##  9  1972    5.47       154
## 10  1974    5.44       154
## # ... with 50 more rows

Prediction Analysis

# Scatter plot of Avg_GDP against Year. Explicit axis labels replace the
# defaults, which would otherwise show the raw expressions
# "df_sel_val2$Year" and "df_sel_val2$Avg_GDP".
plot(
  df_sel_val2$Year, df_sel_val2$Avg_GDP,
  xlab = "Year", ylab = "Average GDP growth (annual %)"
)


# Correlation between year and average GDP
cor(df_sel_val2$Year, df_sel_val2$Avg_GDP)
## [1] -0.4327291
# Fit a simple linear regression of Avg_GDP on Year and keep the model
result <- lm(formula = Avg_GDP ~ Year, data = df_sel_val2)

# Overlay the fitted regression line on the current plot
abline(reg = result)

# Print residuals, coefficients and fit statistics
summary(object = result)
## 
## Call:
## lm(formula = Avg_GDP ~ Year, data = df_sel_val2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.2053 -0.6952  0.1485  0.8327  2.7638 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 93.48694   24.53146   3.811 0.000337 ***
## Year        -0.04505    0.01232  -3.656 0.000554 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.653 on 58 degrees of freedom
## Multiple R-squared:  0.1873, Adjusted R-squared:  0.1732 
## F-statistic: 13.36 on 1 and 58 DF,  p-value: 0.0005541
# Show the model call together with its intercept and slope
print(result)
## 
## Call:
## lm(formula = Avg_GDP ~ Year, data = df_sel_val2)
## 
## Coefficients:
## (Intercept)         Year  
##    93.48694     -0.04505
# To list the components stored in the model object:
# names(result)

# To view the fitted values:
# fitted(result)

# Plot year against fitted values. fitted() is used instead of
# result$fitted, which only works through partial matching of the
# "fitted.values" component name. Explicit axis labels keep the plot readable.
plot(
  df_sel_val2$Year, fitted(result),
  xlab = "Year", ylab = "Fitted average GDP growth (annual %)"
)

Create function to predict global GDP

# Predict the average GDP value for one or more years from the linear model
# stored in the global object `result`.
#
# year: numeric vector of years.
# Returns an unnamed numeric vector with one prediction per element of
# `year`, computed as intercept + slope * year. An empty `year` gives an
# empty result.
Global_GDP <- function(year) {
  # Spell out "coefficients" rather than relying on partial matching of
  # `$coef`, and drop the names so the output is a plain numeric vector.
  coefs <- unname(result$coefficients)
  # Vectorised arithmetic replaces the 1:length(year) loop, which iterated
  # over c(1, 0) and returned NA when `year` was empty.
  coefs[2] * as.numeric(year) + coefs[1]
}

Make Prediction

# Predict 1970, 1969 and 2020 with the hand-written Global_GDP() function
Global_GDP(year = c(1970, 1969, 2020))
## [1] 4.737864 4.782915 2.485350
# Predict the same three years through the model's own predict() method
new_years <- data.frame(Year = c(1970, 1969, 2020))
predict(result, newdata = new_years)
##        1        2        3 
## 4.737864 4.782915 2.485350
# Store the model's prediction for every year already in the dataset
df_sel_val2[["Predicted"]] <- Global_GDP(df_sel_val2[["Year"]])

# Absolute gap between actual and predicted value, expressed as a
# percentage of the predicted value
df_sel_pred <- df_sel_val2 %>%
  mutate(PercentDifference = abs(Avg_GDP - Predicted) / Predicted * 100)

head(df_sel_pred)
## # A tibble: 6 x 5
##    Year Avg_GDP CounOfGDP Predicted PercentDifference
##   <dbl>   <dbl>     <int>     <dbl>             <dbl>
## 1  1970    7.3        141      4.74              54.1
## 2  1969    6.79       144      4.78              42.0
## 3  1976    6.26       160      4.47              40.1
## 4  1964    6.22       128      5.01              24.2
## 5  1968    6.14       141      4.83              27.2
## 6  2004    5.97       251      3.21              86.2

Compare the actual and predicted average GDP growth for 2020

# Pull out the 2020 row to see its actual and predicted average side by side
pred_2020 <- filter(df_sel_pred, Year == 2020)
pred_2020
## # A tibble: 1 x 5
##    Year Avg_GDP CounOfGDP Predicted PercentDifference
##   <dbl>   <dbl>     <int>     <dbl>             <dbl>
## 1  2020   -4.72       226      2.49              290.