INTRODUCTION

This is an analysis of data from different countries, focusing on GDP per capita and its predictors. The dataset was retrieved from Kaggle, and contains information on population, region, area size, infant mortality and more.

# Load the "countries of the world" CSV. Empty strings and "NA" are read as
# missing values, and "," is treated as the decimal separator.
# NOTE(review): the path is absolute and machine-specific — confirm whether it
# should be made relative to the project.
countries <- read.csv(
  file = "/Users/janelletang/Documents/git/countries of the world.csv",
  na.strings = c("", "NA"),
  dec = ","
)
# Preview the first rows of the raw data.
head(countries)
##           Country                              Region Population
## 1    Afghanistan        ASIA (EX. NEAR EAST)            31056997
## 2        Albania  EASTERN EUROPE                         3581655
## 3        Algeria  NORTHERN AFRICA                       32930091
## 4 American Samoa  OCEANIA                                  57794
## 5        Andorra  WESTERN EUROPE                           71201
## 6         Angola  SUB-SAHARAN AFRICA                    12127071
##   Area..sq..mi.. Pop..Density..per.sq..mi.. Coastline..coast.area.ratio.
## 1         647500                       48.0                         0.00
## 2          28748                      124.6                         1.26
## 3        2381740                       13.8                         0.04
## 4            199                      290.4                        58.29
## 5            468                      152.1                         0.00
## 6        1246700                        9.7                         0.13
##   Net.migration Infant.mortality..per.1000.births. GDP....per.capita.
## 1         23.06                             163.07                700
## 2         -4.93                              21.52               4500
## 3         -0.39                              31.00               6000
## 4        -20.71                               9.27               8000
## 5          6.60                               4.05              19000
## 6          0.00                             191.19               1900
##   Literacy.... Phones..per.1000. Arable.... Crops.... Other.... Climate
## 1         36.0               3.2      12.13      0.22     87.65       1
## 2         86.5              71.2      21.09      4.42     74.49       3
## 3         70.0              78.1       3.22      0.25     96.53       1
## 4         97.0             259.5      10.00     15.00     75.00       2
## 5        100.0             497.2       2.22      0.00     97.78       3
## 6         42.0               7.8       2.41      0.24     97.35      NA
##   Birthrate Deathrate Agriculture Industry Service
## 1     46.60     20.34       0.380    0.240   0.380
## 2     15.11      5.22       0.232    0.188   0.579
## 3     17.14      4.61       0.101    0.600   0.298
## 4     22.46      3.27          NA       NA      NA
## 5      8.71      6.25          NA       NA      NA
## 6     45.11     24.20       0.096    0.658   0.246
# Replace the auto-generated column names with short, readable ones.
# The order must match the 20 columns of the raw file.
names(countries) <- c(
  "Country", "Region", "Population", "Area_SqMiles", "PopDens",
  "Coastline", "NetMigration", "InfantMortality", "GDPpc",
  "LiteracyRate", "Phones", "Arable_percent", "Crops_percent",
  "Other_percent", "ClimateRating", "BirthRate", "DeathRate",
  "PrimarySector_percent", "SecondarySector_percent",
  "TertiarySector_percent"
)
# Per-column summary statistics, including the count of missing values.
summary(countries)
##             Country                                    Region  
##  Afghanistan    :  1   SUB-SAHARAN AFRICA                 :51  
##  Albania        :  1   LATIN AMER. & CARIB                :45  
##  Algeria        :  1   ASIA (EX. NEAR EAST)               :28  
##  American Samoa :  1   WESTERN EUROPE                     :28  
##  Andorra        :  1   OCEANIA                            :21  
##  Angola         :  1   NEAR EAST                          :16  
##  (Other)        :221   (Other)                            :38  
##    Population         Area_SqMiles         PopDens        
##  Min.   :7.026e+03   Min.   :       2   Min.   :    0.00  
##  1st Qu.:4.376e+05   1st Qu.:    4648   1st Qu.:   29.15  
##  Median :4.787e+06   Median :   86600   Median :   78.80  
##  Mean   :2.874e+07   Mean   :  598227   Mean   :  379.05  
##  3rd Qu.:1.750e+07   3rd Qu.:  441811   3rd Qu.:  190.15  
##  Max.   :1.314e+09   Max.   :17075200   Max.   :16271.50  
##                                                           
##    Coastline       NetMigration       InfantMortality      GDPpc      
##  Min.   :  0.00   Min.   :-20.99000   Min.   :  2.29   Min.   :  500  
##  1st Qu.:  0.10   1st Qu.: -0.92750   1st Qu.:  8.15   1st Qu.: 1900  
##  Median :  0.73   Median :  0.00000   Median : 21.00   Median : 5550  
##  Mean   : 21.17   Mean   :  0.03812   Mean   : 35.51   Mean   : 9690  
##  3rd Qu.: 10.35   3rd Qu.:  0.99750   3rd Qu.: 55.70   3rd Qu.:15700  
##  Max.   :870.66   Max.   : 23.06000   Max.   :191.19   Max.   :55100  
##                   NA's   :3           NA's   :3        NA's   :1      
##   LiteracyRate        Phones       Arable_percent  Crops_percent   
##  Min.   : 17.60   Min.   :   0.2   Min.   : 0.00   Min.   : 0.000  
##  1st Qu.: 70.60   1st Qu.:  37.8   1st Qu.: 3.22   1st Qu.: 0.190  
##  Median : 92.50   Median : 176.2   Median :10.42   Median : 1.030  
##  Mean   : 82.84   Mean   : 236.1   Mean   :13.80   Mean   : 4.564  
##  3rd Qu.: 98.00   3rd Qu.: 389.6   3rd Qu.:20.00   3rd Qu.: 4.440  
##  Max.   :100.00   Max.   :1035.6   Max.   :62.11   Max.   :50.680  
##  NA's   :18       NA's   :4        NA's   :2       NA's   :2       
##  Other_percent    ClimateRating     BirthRate       DeathRate     
##  Min.   : 33.33   Min.   :1.000   Min.   : 7.29   Min.   : 2.290  
##  1st Qu.: 71.65   1st Qu.:2.000   1st Qu.:12.67   1st Qu.: 5.910  
##  Median : 85.70   Median :2.000   Median :18.79   Median : 7.840  
##  Mean   : 81.64   Mean   :2.139   Mean   :22.11   Mean   : 9.241  
##  3rd Qu.: 95.44   3rd Qu.:3.000   3rd Qu.:29.82   3rd Qu.:10.605  
##  Max.   :100.00   Max.   :4.000   Max.   :50.73   Max.   :29.740  
##  NA's   :2        NA's   :22      NA's   :3       NA's   :4       
##  PrimarySector_percent SecondarySector_percent TertiarySector_percent
##  Min.   :0.00000       Min.   :0.0200          Min.   :0.0620        
##  1st Qu.:0.03775       1st Qu.:0.1930          1st Qu.:0.4293        
##  Median :0.09900       Median :0.2720          Median :0.5710        
##  Mean   :0.15084       Mean   :0.2827          Mean   :0.5653        
##  3rd Qu.:0.22100       3rd Qu.:0.3410          3rd Qu.:0.6785        
##  Max.   :0.76900       Max.   :0.9060          Max.   :0.9540        
##  NA's   :15            NA's   :16              NA's   :15
# Report the dimensions (rows, columns) of the data frame.
dim(countries)
## [1] 227  20

Some observations appear to have missing values, e.g. GDP per capita should not be NA.

# Keep an untouched copy (missing values included) so later sections can build
# their own per-variable subsets from it.
full_countries <- countries

# Drop every row that has at least one missing value.
countries <- na.omit(object = countries)
# Extract the legend grob from a ggplot object so that it can be shared
# between several plots.
# https://github.com/hadley/ggplot2/wiki/Share-a-legend-between-two-ggplot2-graphs
#
# a.gplot: a ggplot object that draws a legend.
# Returns the first grob named "guide-box" in the built plot.
# Stops with an explicit error when the plot contains no such grob
# (for example when its legend has been switched off).
get_legend <- function(a.gplot) {
  tmp <- ggplot_gtable(ggplot_build(a.gplot))
  # vapply() always yields a character vector, even for an empty grob list;
  # grobs without a name become NA and therefore never match.
  grob_names <- vapply(
    tmp$grobs,
    function(x) {
      if (is.null(x$name)) NA_character_ else as.character(x$name)[1]
    },
    character(1)
  )
  leg <- which(grob_names == "guide-box")
  if (length(leg) == 0) {
    stop(
      "get_legend(): the plot has no legend ('guide-box') grob",
      call. = FALSE
    )
  }
  # `[[` needs a single index, so use the first match if there are several.
  tmp$grobs[[leg[1]]]
}
####Graphical Analysis

# Bar chart: mean GDP per capita (in thousands) for each region.
# NOTE(review): `fun.y` is deprecated in favour of `fun` from ggplot2 3.3.0;
# kept unchanged here — confirm the minimum ggplot2 version before switching.
meanGDP_1 <- ggplot(data = countries) +
  geom_bar(
    mapping = aes(x = Region, y = GDPpc / 1000, fill = as.factor(Region)),
    position = "dodge",
    stat = "summary",
    fun.y = "mean"
  ) +
  theme(axis.text.x = element_blank()) +
  labs(
    y = "GDP per capita ('000s)",
    title = "Average GDP per capita",
    fill = "Regions"
  )

# Box plot: spread of GDP per capita within each region (legend hidden).
meanGDP_2 <- ggplot(
  data = countries,
  mapping = aes(x = Region, y = GDPpc, fill = as.factor(Region))
) +
  geom_boxplot(alpha = 0.3) +
  theme(axis.text.x = element_blank(), legend.position = "none") +
  labs(title = "Boxplot of GDP per capita", fill = "Regions")

# Reuse the bar chart's legend for both panels: the two plots are stacked in
# the left column and the shared legend sits in the right column.
region_legend <- get_legend(meanGDP_1)
grid.arrange(
  arrangeGrob(
    meanGDP_1 + theme(legend.position = "none"),
    meanGDP_2,
    nrow = 2
  ),
  region_legend,
  ncol = 2,
  widths = c(2.2, 1)
)

# Kernel density estimate of GDP per capita.
# NOTE(review): base plot() normally returns NULL, so `densityGDP` probably
# holds nothing useful — confirm before relying on it.
densityGDP <- plot(density(countries$GDPpc), main = "GDP per capita")

# Add a "rug" to the density estimate; it makes it possible to discern the
# individual data points.
rug(countries$GDPpc)

It appears that the North American region has the highest average GDP per capita, with a very small range, while the Sub-Saharan African region has the lowest, again with a very small range, but with more outliers.
Asia, both the Near East and elsewhere, has a large range.

# Bar chart: total population (in millions) for each region.
pop_plot1 <- ggplot(data = countries) +
  geom_bar(
    mapping = aes(
      x = Region,
      y = Population / 1000000,
      fill = as.factor(Region)
    ),
    position = "dodge",
    stat = "summary",
    fun.y = "sum"
  ) +
  theme(axis.text.x = element_blank()) +
  labs(
    title = "Total Population",
    y = "Total Population (millions)",
    fill = "Regions"
  )

# Bar chart: mean population density for each region.
pop_plot2 <- ggplot(data = countries) +
  geom_bar(
    mapping = aes(x = Region, y = PopDens, fill = as.factor(Region)),
    position = "dodge",
    stat = "summary",
    fun.y = "mean"
  ) +
  theme(axis.text.x = element_blank()) +
  labs(
    title = "Average Population Density",
    y = "Population Density (Person/sqmile)",
    fill = "Regions"
  )

# Density plot above the total-population plot, both without their own
# legends, with the previously extracted region legend in the right column.
grid.arrange(
  arrangeGrob(
    pop_plot2 + theme(legend.position = "none"),
    pop_plot1 + theme(legend.position = "none")
  ),
  region_legend,
  ncol = 2,
  widths = c(2.2, 1)
)

Having compared the population data with the mean GDP per capita for each region, let's see whether any variables are strongly correlated with GDP per capita.

# Pairwise correlations between columns 3 to 20, rounded to one decimal
# place.
cor_matrix <- round(cor(countries[, 3:20]), digits = 1)

# Correlation of GDP per capita with columns 4 to 20.
# NOTE(review): this range includes GDPpc itself (hence the 1.0 entry) and
# leaves out Population (column 3) — confirm that this is intended.
cor_GDPpc <- cor(x = countries$GDPpc, y = countries[, 4:20])

# Print the single row of the result as a named vector.
cor_GDPpc[1, ]
##            Area_SqMiles                 PopDens               Coastline 
##              0.06835595              0.19012217              0.03581518 
##            NetMigration         InfantMortality                   GDPpc 
##              0.37879042             -0.63908984              1.00000000 
##            LiteracyRate                  Phones          Arable_percent 
##              0.52288013              0.88352011              0.04646475 
##           Crops_percent           Other_percent           ClimateRating 
##             -0.20784374              0.06644526              0.36056651 
##               BirthRate               DeathRate   PrimarySector_percent 
##             -0.65879545             -0.24756242             -0.61691880 
## SecondarySector_percent  TertiarySector_percent 
##              0.03285465              0.53655075
# Graphical visualisation of the correlations between variables: coloured
# tiles, upper triangle only.
corrplot(corr = cor_matrix, method = "color", type = "upper")

# Same visualisation for the GDP-per-capita correlations only, drawn with
# cl.pos = "n".
corrplot(corr = cor_GDPpc, method = "color", cl.pos = "n")


From an economic standpoint, many of these observations make sense. It seems obvious that a country with high GDP per capita would have higher phone usage, since higher income means a greater ability to afford phones (necessity vs. luxury). Also, higher GDP per capita economies have a larger tertiary sector (since GDP is calculated based on total spending). Similarly, with economic growth, the birth rate would drop due to a number of factors, for example less time for individuals to raise a child, more years spent in higher education, and a deeper understanding of contraception. Additionally, infant mortality drops with economic growth as access to healthcare, sanitation and nutrition improves.

Let's take a closer look at some of the predictors that are highly correlated with GDP per capita.

# Subset for the infant-mortality plot. It is built from the unfiltered data
# so that only rows missing one of these three columns are dropped. Columns
# are selected by name so the subset does not depend on column order.
countries_infant <- full_countries[, c("Country", "InfantMortality", "GDPpc")]
countries_infant <- na.omit(countries_infant)

# Scatter plot with a smoothed trend: infant mortality against GDP per capita.
# No `fill` is passed: an argument given to ggplot() outside aes() is not
# used by the plot.
infant_graph <- ggplot(
  countries_infant,
  aes(x = InfantMortality, y = GDPpc)
) +
  geom_point(color = 3) +
  geom_smooth(color = 4) +
  labs(
    y = "GDP per capita",
    title = "Infant Mortality against GDP per capita"
  )

# Subset for the literacy plot, again taken from the unfiltered data.
countries_lit <- full_countries[, c("Country", "GDPpc", "LiteracyRate")]
countries_lit <- na.omit(countries_lit)

# Scatter plot with a smoothed trend: literacy rate against GDP per capita.
lit_graph <- ggplot(countries_lit, aes(x = LiteracyRate, y = GDPpc)) +
  geom_point(color = 3) +
  geom_smooth(color = 4) +
  labs(y = "GDP per capita", title = "Literacy rate against GDP per capita")

# Show both plots side by side.
grid.arrange(infant_graph, lit_graph, ncol = 2)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

While real GDP remains the most widely used measure of economic growth and living standards, in many cases the human development index (HDI) is used to measure economic development, which is similar to economic growth but encompasses many other factors, such as poverty rate, income equality and sustainability. The HDI is a composite measure comprising life expectancy at birth, education (expected and mean years of schooling) and gross national income per capita.

Of course, this does not cover all of the factors that define economic development, but it already does a better job than GDP per capita. Life expectancy at birth would be significantly lowered by higher infant mortality rates.
It appears that a high level of GDP per capita indicates higher literacy rates and lower infant mortality, but low levels of GDP per capita are associated with high variance in both predictors. This suggests that incorporating literacy rate into a composite measure can help identify countries with higher levels of GDP per capita more accurately, but not necessarily countries with lower levels.

# Subset for analysing birth and death rates, taken from the unfiltered data.
# Columns are selected by name so the subset does not depend on column order.
# Empty strings need no extra handling here: read.csv() was already called
# with na.strings = c("", "NA"), so they arrive as NA.
countries_bd <- full_countries[
  , c("Country", "GDPpc", "BirthRate", "DeathRate")
]
# Drop rows that are missing any of the selected columns.
countries_bd <- na.omit(countries_bd)
head(countries_bd)
##           Country GDPpc BirthRate DeathRate
## 1    Afghanistan    700     46.60     20.34
## 2        Albania   4500     15.11      5.22
## 3        Algeria   6000     17.14      4.61
## 4 American Samoa   8000     22.46      3.27
## 5        Andorra  19000      8.71      6.25
## 6         Angola   1900     45.11     24.20
# Plot birth and death rates against GDP per capita to see the overall trend.
# No `fill` is passed: an argument given to ggplot() outside aes() is not
# used by the plot.
birth_graph <- ggplot(countries_bd, aes(x = BirthRate, y = GDPpc)) +
  geom_point(color = 3) +
  geom_smooth(color = 4) +
  labs(y = "GDP per capita", title = "Birth rate against GDP per capita")

# The death-rate plot overlays the default smoother with a linear fit.
death_graph <- ggplot(countries_bd, aes(x = DeathRate, y = GDPpc)) +
  geom_point(color = 6) +
  geom_smooth(color = 4) +
  geom_smooth(method = "lm", color = 7) +
  labs(y = "GDP per capita", title = "Death rate against GDP per capita")

# Show both plots side by side.
grid.arrange(birth_graph, death_graph, ncol = 2)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'


We can see that death rates actually vary quite a bit, with a slight downward trend but with great fluctuations. Initially, in low development countries, the death rate appears to be widely spread, which can be reflective of the different types of living conditions and climates that vary across LEDCs. However, with industrialisation, the death rate should drop quickly as major causes of death, such as malnourishment and curable diseases, are removed. However, at higher levels of GDP per capita, the death rate begins to rise again. We could theorise that individuals would be working more, which results in a more unhealthy lifestyle and less energy to care for the elderly. Additionally, with higher life expectancy come more health risks such as cancer. However, there would need to be additional analysis on datasets focusing on death rates and lifestyle choices to make a clear judgement.
We also see a similar effect to that for literacy rates and infant mortality: with higher GDP per capita, the variability of birth and death rates decreases.

Another significant factor is the composition of primary, secondary, and tertiary sector in an economy.

# Subset holding GDP per capita and the three sector shares, taken from the
# unfiltered data; rows missing any of these columns are dropped.
sector_columns <- c(
  "Country", "GDPpc", "PrimarySector_percent",
  "SecondarySector_percent", "TertiarySector_percent"
)
countries_sec <- na.omit(data.frame(full_countries[, sector_columns]))
head(countries_sec)
##              Country GDPpc PrimarySector_percent SecondarySector_percent
## 1       Afghanistan    700                 0.380                   0.240
## 2           Albania   4500                 0.232                   0.188
## 3           Algeria   6000                 0.101                   0.600
## 6            Angola   1900                 0.096                   0.658
## 7          Anguilla   8600                 0.040                   0.180
## 8 Antigua & Barbuda  11000                 0.038                   0.220
##   TertiarySector_percent
## 1                  0.380
## 2                  0.579
## 3                  0.298
## 6                  0.246
## 7                  0.780
## 8                  0.743
# Build the data for the 25 countries with the highest and the 25 with the
# lowest GDP per capita.

# Highest GDP per capita first.
top25 <- head(arrange(countries_sec, desc(GDPpc)), n = 25)
# GDPpc was only needed for ranking; keep the country and the sector shares.
top25 <- top25[, names(top25) != "GDPpc"]
# Fix the factor levels in ranked order so the plot keeps this ordering.
top25$Country <- factor(top25$Country, levels = top25$Country)
# Reshape to long format (one row per country and sector) so that the shares
# can be drawn as a stacked bar graph.
top25.m <- melt(top25, id.vars = "Country")

# Lowest GDP per capita first: a plain ascending sort.
bottom25 <- head(arrange(countries_sec, GDPpc), n = 25)
bottom25 <- bottom25[, names(bottom25) != "GDPpc"]
bottom25$Country <- factor(bottom25$Country, levels = bottom25$Country)
bottom25.m <- melt(bottom25, id.vars = "Country")


# Stacked bar chart of sector shares for the high GDP-per-capita countries.
# Country labels are rotated to stay readable and the y axis is formatted as
# percentages.
ggplot(
  data = top25.m,
  mapping = aes(x = Country, y = value, fill = variable)
) +
  geom_bar(stat = "identity") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.2),
    axis.title.y = element_blank()
  ) +
  scale_y_continuous(labels = percent) +
  labs(
    title = "Proportion of different sectors in high GDP per capita countries",
    fill = "Sectors"
  )

# Same chart for the low GDP-per-capita countries.
ggplot(
  data = bottom25.m,
  mapping = aes(x = Country, y = value, fill = variable)
) +
  geom_bar(stat = "identity") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.2),
    axis.title.y = element_blank()
  ) +
  scale_y_continuous(labels = percent) +
  labs(
    title = "Proportion of different sectors in low GDP per capita countries",
    fill = "Sectors"
  )


It is clear that for the higher GDP per capita countries, services take up a larger proportion of the economy, while agricultural/primary commodities take up a very small percentage. The opposite is often true for low GDP per capita countries, where primary commodities take up a larger proportion. This is not always the case, as shown by East Timor, which has the lowest GDP per capita, among others; it is likely that these are just outliers. Additionally, most of these low income per capita countries appear to have a wider variety of sector compositions.