Synopsis

This HTML document is created from the associated R Markdown file. The data frame used this week is gapminder_unfiltered, an excerpt of the Gapminder data from Gapminder.org. In this assignment, I performed data transformation and manipulation tasks using common functions of the dplyr package. Furthermore, I performed exploratory data analysis by combining the transformed data with data visualization techniques, which led to the following findings:

  1. Most countries across the world have a GDP/capita of less than 20000
  2. Europe has the largest and Africa the smallest GDP/capita for the year 2007
  3. Qatar has the largest GDP/capita in the year 2007
  4. The growth percentage in GDP/capita has always been positive for India

Packages Required

Package(s) used in this assignment to execute the R code are listed below:

library(ggplot2)  # Package to produce complex multi-layered graphs in R
library(gapminder) # Excerpt from the Gapminder data; provides gapminder_unfiltered
library(tidyverse) # Set of packages including dplyr and ggplot2

Source Code

The data set gapminder_unfiltered is an excerpt of the Gapminder data on life expectancy, GDPpercapita, and population by country. This data frame is not filtered on year and has all 3313 observations and 6 variables described below:

  1. country : data frame has data for 187 countries

  2. continent : data frame has data for 6 continents

  3. year: calendar years from 1950 to 2007 (58 distinct years; some countries have annual observations, while others, such as Afghanistan, are recorded only every 5 years)

  4. lifeExp: life expectancy at birth(in years)

  5. pop: population of a country in a particular year

  6. gdpPercap: per capita GDP of a country in that particular year

Note: The data set contains no missing values or empty rows.

Data Description

# Bind the unfiltered Gapminder excerpt to the name used by every step below
gapminder_data <- gapminder_unfiltered

# Number of rows and variables
dim(gapminder_data)
## [1] 3313    6
# Names of variables
names(gapminder_data)
## [1] "country"   "continent" "year"      "lifeExp"   "pop"       "gdpPercap"
# Inspect the first and the last six rows
head(gapminder_data)
## # A tibble: 6 × 6
##       country continent  year lifeExp      pop gdpPercap
##        <fctr>    <fctr> <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan      Asia  1952  28.801  8425333  779.4453
## 2 Afghanistan      Asia  1957  30.332  9240934  820.8530
## 3 Afghanistan      Asia  1962  31.997 10267083  853.1007
## 4 Afghanistan      Asia  1967  34.020 11537966  836.1971
## 5 Afghanistan      Asia  1972  36.088 13079460  739.9811
## 6 Afghanistan      Asia  1977  38.438 14880372  786.1134
tail(gapminder_data)
## # A tibble: 6 × 6
##    country continent  year lifeExp      pop gdpPercap
##     <fctr>    <fctr> <int>   <dbl>    <int>     <dbl>
## 1 Zimbabwe    Africa  1982  60.363  7636524  788.8550
## 2 Zimbabwe    Africa  1987  62.351  9216418  706.1573
## 3 Zimbabwe    Africa  1992  60.377 10704340  693.4208
## 4 Zimbabwe    Africa  1997  46.809 11404948  792.4500
## 5 Zimbabwe    Africa  2002  39.989 11926563  672.0386
## 6 Zimbabwe    Africa  2007  43.487 12311143  469.7093
# Count missing values in each of the three numeric measures
sum(is.na(gapminder_data$lifeExp))
## [1] 0
sum(is.na(gapminder_data$pop))
## [1] 0
sum(is.na(gapminder_data$gdpPercap))
## [1] 0
# List every row that has at least one missing field (none are returned)
gapminder_data[!complete.cases(gapminder_data),]
## # A tibble: 0 × 6
## # ... with 6 variables: country <fctr>, continent <fctr>, year <int>,
## #   lifeExp <dbl>, pop <int>, gdpPercap <dbl>
# Summary statistics for every column of the data frame
summary(gapminder_data)
##            country        continent         year         lifeExp     
##  Czech Republic:  58   Africa  : 637   Min.   :1950   Min.   :23.60  
##  Denmark       :  58   Americas: 470   1st Qu.:1967   1st Qu.:58.33  
##  Finland       :  58   Asia    : 578   Median :1982   Median :69.61  
##  Iceland       :  58   Europe  :1302   Mean   :1980   Mean   :65.24  
##  Japan         :  58   FSU     : 139   3rd Qu.:1996   3rd Qu.:73.66  
##  Netherlands   :  58   Oceania : 187   Max.   :2007   Max.   :82.67  
##  (Other)       :2965                                                 
##       pop              gdpPercap       
##  Min.   :5.941e+04   Min.   :   241.2  
##  1st Qu.:2.680e+06   1st Qu.:  2505.3  
##  Median :7.560e+06   Median :  7825.8  
##  Mean   :3.177e+07   Mean   : 11313.8  
##  3rd Qu.:1.961e+07   3rd Qu.: 17355.8  
##  Max.   :1.319e+09   Max.   :113523.1  
## 
# Distinct continents, sorted by continent
gapminder_data %>%
  distinct(continent) %>%
  arrange(continent)
## # A tibble: 6 × 1
##   continent
##      <fctr>
## 1    Africa
## 2  Americas
## 3      Asia
## 4    Europe
## 5       FSU
## 6   Oceania
# Distinct countries, sorted by country
gapminder_data %>%
  distinct(country) %>%
  arrange(country)
## # A tibble: 187 × 1
##        country
##         <fctr>
## 1  Afghanistan
## 2      Albania
## 3      Algeria
## 4       Angola
## 5    Argentina
## 6      Armenia
## 7        Aruba
## 8    Australia
## 9      Austria
## 10  Azerbaijan
## # ... with 177 more rows
# Distinct years, most recent first
gapminder_data %>%
  distinct(year) %>%
  arrange(desc(year))
## # A tibble: 58 × 1
##     year
##    <int>
## 1   2007
## 2   2006
## 3   2005
## 4   2004
## 5   2003
## 6   2002
## 7   2001
## 8   2000
## 9   1999
## 10  1998
## # ... with 48 more rows
# Number of distinct countries per continent among the 2007 rows
gapminder_data %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarise(no_countries = n_distinct(country)) %>%
  arrange(continent)
## # A tibble: 6 × 2
##   continent no_countries
##      <fctr>        <int>
## 1    Africa           53
## 2  Americas           33
## 3      Asia           43
## 4    Europe           34
## 5       FSU            9
## 6   Oceania           11
# Continent and country of every 2007 row
gapminder_data %>%
  filter(year == 2007) %>%
  select(continent, country)
## # A tibble: 183 × 2
##    continent     country
##       <fctr>      <fctr>
## 1       Asia Afghanistan
## 2     Europe     Albania
## 3     Africa     Algeria
## 4     Africa      Angola
## 5   Americas   Argentina
## 6        FSU     Armenia
## 7   Americas       Aruba
## 8    Oceania   Australia
## 9     Europe     Austria
## 10      Asia  Azerbaijan
## # ... with 173 more rows

Exploratory Data Analysis

Analysis 1

#For the year 2007, what is the distribution of GDP per capita across all countries?

# Histogram of 2007 GDP per capita across countries; each bin spans 1500
# and is outlined in blue
gapminder_data %>%
  filter(year == 2007) %>%
  ggplot(mapping = aes(x = gdpPercap)) +
  geom_histogram(binwidth = 1500, color = "blue") +
  labs(
    title = "Distribution of GDP/capita for 2007 across all countries",
    x = "GDP/capita",
    y = "# of countries"
  )

#Result: The histogram is densely populated below a GDP/capita of 20000, and beyond the 20000 mark the density keeps decreasing. So, we can conclude that most countries across the world have a GDP/capita of less than 20000

Analysis 2

#For the year 2007, how do the distributions differ across the different continents?

# Box plot of 2007 GDP per capita, one box per continent, drawn horizontally.
# The filtered rows must reach ggplot() through the pipe: passing
# `data = gapminder_data` by name would discard the piped 2007 subset and plot
# every year under a "2007" title.
gapminder_data %>%
  filter(year == 2007) %>%
  ggplot(mapping = aes(x = continent, y = gdpPercap, color = continent)) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 1) +
  coord_flip() +
  ggtitle("Distribution of GDP/Capita for different continents in 2007") +
  labs(y = "GDP/capita", x = "Continents")

#Result: The boxplot shows that developed regions like Europe and Oceania have large GDP/capita, whereas less developed or underdeveloped regions like Africa have small GDP/capita in the year 2007

Analysis 3

#For the year 2007, what are the top 10 countries with the largest GDP per capita?

# Ten countries with the highest GDP per capita among the 2007 rows
gdpPercap_top10 <- gapminder_data %>%
  filter(year == 2007) %>%
  select(country, gdpPercap) %>%
  arrange(desc(gdpPercap)) %>%
  head(n = 10)

# Bar chart of the ranking. `country` is a factor, so a bare `x = country`
# draws the bars in factor-level order; reorder() on the negated value places
# them from largest to smallest GDP per capita instead.
gdpPercap_top10 %>%
  ggplot() +
  geom_bar(
    mapping = aes(
      x = reorder(country, -gdpPercap),
      y = gdpPercap,
      fill = country
    ),
    stat = "Identity"
  ) +
  ggtitle(" Top 10 countries with the largest GDP/capita in 2007") +
  labs(y = "GDP/capita", x = "Countries")

#Result: The bar graph clearly suggests that Qatar has largest GDP/capita among top 10 countries with largest GDP/capita in year 2007 

Analysis 4

#Plot the GDP per capita for your country of origin for all years available

# Step plot of India's GDP per capita for every year present in the data.
# The title names GDP/capita so that it matches the plotted variable and the
# y-axis label.
gapminder_data %>%
  filter(country == "India") %>%
  ggplot() +
  geom_step(
    mapping = aes(x = year, y = gdpPercap),
    linetype = 1,
    color = "blue"
  ) +
  ggtitle("GDP/capita of India across all years") +
  labs(y = "GDP/capita", x = "Year")

#Result: GDP/capita has been continuously rising for India since the 1950s

Analysis 5

#What was the percent growth (or decline) in GDP per capita in 2007?

# India's GDP per capita in 2002 and in 2007, each kept as a one-cell table
GDP_prev_year <- gapminder_data %>%
  filter(year == 2002, country == "India") %>%
  select(gdpPercap)
GDP_curr_year <- gapminder_data %>%
  filter(year == 2007, country == "India") %>%
  select(gdpPercap)

# Relative change between the two observations, expressed in percent
Percentage_growth <- (GDP_curr_year - GDP_prev_year) / GDP_prev_year * 100
Percentage_growth
##   gdpPercap
## 1  40.38546

Analysis 6

#What has been the historical growth (or decline) in GDP per capita for your country?

# India's rows in chronological order, with the percent change in GDP per
# capita relative to the previous observation. dplyr::lag() shifts the column
# down by one row, so the whole series is computed in a single vectorised step
# rather than by assigning one cell at a time inside a loop. The first
# observation has no predecessor; its growth is set to 0 as a placeholder.
historical_gdp <- gapminder_data %>%
  filter(country == "India") %>%
  arrange(year) %>%
  mutate(
    GrowthPercentage = coalesce(
      (gdpPercap - dplyr::lag(gdpPercap)) / dplyr::lag(gdpPercap) * 100,
      0
    )
  )
nrow(historical_gdp)
## [1] 12

# Bar chart of the percent change per observation year, shaded by year
historical_gdp %>%
  ggplot(mapping = aes(x = year, y = GrowthPercentage, fill = year)) +
  geom_bar(stat = "Identity") +
  labs(
    title = "Historical % change in GDP/capita for India",
    x = "Year",
    y = "% change in GDP/capita"
  )

#Result: India has maintained a positive percentage of growth in GDP/capita across all the years. The largest change, of over 40%, occurred between 2002 and 2007