Introduction

This exercise uses data from the World Bank on the “ease of doing business” metric by country and region. To guide the data exploration, I would like to know which countries, grouped by continent, have the greatest ease of doing business.

Step 1: Importing and cleaning data

# set your working directory

# import the data, skipping the leading rows of the file as they are not
# relevant for the analysis; the column-header row and the data rows are read
# in two separate passes (header row after 3 skipped lines, data after 5)
headers <- read.csv(
  "worldbank.csv",
  skip = 3,
  header = FALSE,
  nrows = 1,
  as.is = TRUE
)
data <- read.csv("worldbank.csv", skip = 5, header = FALSE)
# use the header row as the column names of the data
colnames(data) <- headers

# view the data
head(data, 5)
##                  Country Name Country Code
## 1                       Aruba          ABW
## 2 Africa Eastern and Southern          AFE
## 3                 Afghanistan          AFG
## 4  Africa Western and Central          AFW
## 5                      Angola          AGO
##                                                        Indicator Name
## 1 Ease of doing business index (1=most business-friendly regulations)
## 2 Ease of doing business index (1=most business-friendly regulations)
## 3 Ease of doing business index (1=most business-friendly regulations)
## 4 Ease of doing business index (1=most business-friendly regulations)
## 5 Ease of doing business index (1=most business-friendly regulations)
##   Indicator Code 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971
## 1 IC.BUS.EASE.XQ   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## 2 IC.BUS.EASE.XQ   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## 3 IC.BUS.EASE.XQ   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## 4 IC.BUS.EASE.XQ   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## 5 IC.BUS.EASE.XQ   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
##   1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986
## 1   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## 2   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## 3   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## 4   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## 5   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
##   1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001
## 1   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## 2   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## 3   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## 4   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## 5   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
##   2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016
## 1   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## 2   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## 3   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## 4   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
## 5   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
##   2017 2018 2019 2020 NA
## 1   NA   NA   NA   NA NA
## 2   NA   NA   NA   NA NA
## 3   NA   NA  173   NA NA
## 4   NA   NA   NA   NA NA
## 5   NA   NA  177   NA NA
# note the columns of the data
names(data)
##  [1] "Country Name"   "Country Code"   "Indicator Name" "Indicator Code"
##  [5] "1960"           "1961"           "1962"           "1963"          
##  [9] "1964"           "1965"           "1966"           "1967"          
## [13] "1968"           "1969"           "1970"           "1971"          
## [17] "1972"           "1973"           "1974"           "1975"          
## [21] "1976"           "1977"           "1978"           "1979"          
## [25] "1980"           "1981"           "1982"           "1983"          
## [29] "1984"           "1985"           "1986"           "1987"          
## [33] "1988"           "1989"           "1990"           "1991"          
## [37] "1992"           "1993"           "1994"           "1995"          
## [41] "1996"           "1997"           "1998"           "1999"          
## [45] "2000"           "2001"           "2002"           "2003"          
## [49] "2004"           "2005"           "2006"           "2007"          
## [53] "2008"           "2009"           "2010"           "2011"          
## [57] "2012"           "2013"           "2014"           "2015"          
## [61] "2016"           "2017"           "2018"           "2019"          
## [65] "2020"           "NA"
# check the number of rows and columns
dim(data)
## [1] 266  66
# keep only the country name and its 2019 figure, under short column names
data19 <- data.frame(
  country = data[["Country Name"]],
  metric = data[["2019"]]
)

# drop the rows that have no 2019 figure
data19 <- na.omit(data19)

# preview the cleaned data
head(data19, n = 5)
##                 country metric
## 3           Afghanistan    173
## 5                Angola    177
## 6               Albania     82
## 9  United Arab Emirates     16
## 10            Argentina    126
# check the number of rows and columns
dim(data19)
## [1] 189   2

Step 2: Analysing data

# look up the continent of every country from its English name
library(countrycode)
data19$continent <- countrycode(
  sourcevar = data19$country,
  origin = "country.name",
  destination = "continent"
)
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: Kosovo
# remove countries with no mapping
# first count the missing values: rows without a 2019 figure were already
# dropped, so the single NA is the continent of Kosovo, which countrycode
# reported as unmatched in the warning above
sum(is.na(data19))
## [1] 1
# drop every row that contains an NA, then preview the first rows
data19 <- na.omit(data19)
head(data19,8)
##                 country metric continent
## 3           Afghanistan    173      Asia
## 5                Angola    177    Africa
## 6               Albania     82    Europe
## 9  United Arab Emirates     16      Asia
## 10            Argentina    126  Americas
## 11              Armenia     47      Asia
## 13  Antigua and Barbuda    113  Americas
## 14            Australia     14   Oceania
# per continent: mean of the metric (rounded to a whole number) and the
# number of countries, using dplyr's group_by, summarise and n
library(dplyr)
group_data19 <- data19 %>%
  group_by(continent) %>%
  summarise(
    mean = round(mean(metric), digits = 0),
    n = n()
  )
group_data19
## # A tibble: 5 × 3
##   continent  mean     n
##   <chr>     <dbl> <int>
## 1 Africa      137    54
## 2 Americas    109    35
## 3 Asia         83    47
## 4 Europe       42    40
## 5 Oceania     108    12
# visualise the data
library(ggplot2)

# bar chart of the mean metric per continent; per the indicator name, a value
# of 1 means the most business-friendly regulations, so lower bars are better
ggplot(group_data19, aes(x = continent, y = mean)) +
  geom_col(fill = "blue") +
  labs(
    title = "Bar Chart",
    subtitle = "Mean of Ease of Doing Business Grouped by Continent",
    caption = "Source: World Bank",
    x = "Continent",
    y = "Mean, Ease of Doing Business"
  )

# histogram of the metric across countries: the x-axis is the metric value and
# the y-axis is the number of countries per bin, so the axes are labelled as
# such (the x-axis was previously mislabelled "Country")
hist(
  data19$metric,
  breaks = 10,
  ylim = c(0, 25),
  main = "Ease of Doing Business by Country",
  col = "#4cbea3",
  xlab = "Ease of doing business index (1 = most business-friendly)",
  ylab = "Number of countries"
)

Conclusion

Based on the World Bank data, the lower the value of the ease of doing business metric, the greater the ease of doing business in that country. Hence, from the first bar chart, we can conclude that the European continent has the greatest ease of doing business, followed by Asia, Oceania, the Americas and then Africa. I was also interested to find out the distribution of the ease of doing business metric across countries. According to the histogram (second chart), the distribution of this metric is more or less even across countries. Aside from the bin closest to the 200 mark, a similar number of countries falls into each of the bins from 0 to 175. This is to be expected: the metric is a ranking in which 1 denotes the most business-friendly regulations, so each country occupies its own position and the values are spread evenly over the range.