Gapminder_Unfiltered Analysis

Synopsis

This HTML document is associated with the R Markdown file that is used to complete the week 4 assignment. This R Markdown file, which I wrote, summarizes the packages I used to complete my homework assignment. An R Markdown file is a very helpful tool for summarizing a whole task: the code and its results are documented neatly together, as shown below. Initial findings: 1. GDP per capita for the year 2007 is highest for European nations and lowest for African nations. 2. Qatar has the highest GDP per capita for the year 2007 among all the countries. 3. India’s GDP per capita has grown over the years, increasing by 348.7% between 1952 and 2007.

Packages Required

The following packages were installed and used:

library(ggplot2)  ##Package to produce complex multi-layered graphs in R
library(gapminder) ##Provides our data in a data frame and tab delimited form
library(tidyverse) ##set of packages that work in harmony because they share common data representations and 'API' design
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats

Source Code

Our Gapminder data is on life expectancy, GDP per capita, and population by country. The supplemental data frame gapminder_unfiltered, our analysis data, was not filtered on year or for complete data and has 3313 rows and 6 columns. Each variable is described below: 1. country: The name of the country; has 187 distinct values 2. continent: The continent in which the country lies; has 6 distinct values 3. year: The year of the observation, running through 2007. Because the data is unfiltered, the years are not restricted to 5-year increments: some countries have as many as 58 yearly observations 4. lifeExp: Life expectancy at birth, in years 5. pop: The total population of the country 6. gdpPercap: GDP per capita

Data Description

# Open the package help page that documents the dataset
help("gapminder_unfiltered")

# Compact overview of the object: its classes, size and the type of each column
str(object = gapminder_unfiltered)
## Classes 'tbl_df', 'tbl' and 'data.frame':    3313 obs. of  6 variables:
##  $ country  : Factor w/ 187 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 6 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ pop      : int  8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num  779 821 853 836 740 ...
# Treat year as a categorical variable: overwrite the integer column with a
# factor built from the same values
gapminder_unfiltered[["year"]] <- factor(gapminder_unfiltered[["year"]])

# Size of the dataset: number of rows first, then number of columns
c(nrow(gapminder_unfiltered), ncol(gapminder_unfiltered))
## [1] 3313    6
# Column names
colnames(gapminder_unfiltered)
## [1] "country"   "continent" "year"      "lifeExp"   "pop"       "gdpPercap"
# Peek at the first six rows
head(gapminder_unfiltered, n = 6)
## # A tibble: 6 x 6
##       country continent   year lifeExp      pop gdpPercap
##        <fctr>    <fctr> <fctr>   <dbl>    <int>     <dbl>
## 1 Afghanistan      Asia   1952  28.801  8425333  779.4453
## 2 Afghanistan      Asia   1957  30.332  9240934  820.8530
## 3 Afghanistan      Asia   1962  31.997 10267083  853.1007
## 4 Afghanistan      Asia   1967  34.020 11537966  836.1971
## 5 Afghanistan      Asia   1972  36.088 13079460  739.9811
## 6 Afghanistan      Asia   1977  38.438 14880372  786.1134
# Peek at the last six rows
tail(gapminder_unfiltered, n = 6)
## # A tibble: 6 x 6
##    country continent   year lifeExp      pop gdpPercap
##     <fctr>    <fctr> <fctr>   <dbl>    <int>     <dbl>
## 1 Zimbabwe    Africa   1982  60.363  7636524  788.8550
## 2 Zimbabwe    Africa   1987  62.351  9216418  706.1573
## 3 Zimbabwe    Africa   1992  60.377 10704340  693.4208
## 4 Zimbabwe    Africa   1997  46.809 11404948  792.4500
## 5 Zimbabwe    Africa   2002  39.989 11926563  672.0386
## 6 Zimbabwe    Africa   2007  43.487 12311143  469.7093
# Recode every value equal to -99 (treated here as the missing-value code)
# as NA in each of the three numeric measures
is.na(gapminder_unfiltered$gdpPercap) <-
  which(gapminder_unfiltered$gdpPercap == -99)
is.na(gapminder_unfiltered$pop) <-
  which(gapminder_unfiltered$pop == -99)
is.na(gapminder_unfiltered$lifeExp) <-
  which(gapminder_unfiltered$lifeExp == -99)

# Count the missing values in each numeric measure.
# is.na() is applied directly to the column: comparing the values with TRUE
# first is unnecessary and hides the intent of the check.
sum(is.na(gapminder_unfiltered$gdpPercap))
## [1] 0
sum(is.na(gapminder_unfiltered$pop))
## [1] 0
sum(is.na(gapminder_unfiltered$lifeExp))
## [1] 0
# Summary statistics of every column of the final dataset
summary(gapminder_unfiltered)
##            country        continent         year         lifeExp     
##  Czech Republic:  58   Africa  : 637   2002   : 187   Min.   :23.60  
##  Denmark       :  58   Americas: 470   1997   : 184   1st Qu.:58.33  
##  Finland       :  58   Asia    : 578   1992   : 183   Median :69.61  
##  Iceland       :  58   Europe  :1302   2007   : 183   Mean   :65.24  
##  Japan         :  58   FSU     : 139   1977   : 171   3rd Qu.:73.66  
##  Netherlands   :  58   Oceania : 187   1982   : 171   Max.   :82.67  
##  (Other)       :2965                   (Other):2234                  
##       pop              gdpPercap       
##  Min.   :5.941e+04   Min.   :   241.2  
##  1st Qu.:2.680e+06   1st Qu.:  2505.3  
##  Median :7.560e+06   Median :  7825.8  
##  Mean   :3.177e+07   Mean   : 11313.8  
##  3rd Qu.:1.961e+07   3rd Qu.: 17355.8  
##  Max.   :1.319e+09   Max.   :113523.1  
## 

Exploratory Data Analysis - 1

# Country, continent and GDP per capita for the 2007 observations only
gdppercap2007 <- gapminder_unfiltered %>%
  filter(year == 2007) %>%
  select(country, continent, gdpPercap)

# Scatter plot with one point per country and GDP per capita on the y axis
ggplot(gdppercap2007, aes(x = country, y = gdpPercap)) +
  geom_point() +
  labs(
    title = "2007 Country wise gdpPercap distribution",
    x = "Country",
    y = "gdpPercap"
  )

# For 2007, most countries fall in the 0-20000 range of GDP per capita; the higher the GDP per capita level, the fewer countries there are.

Exploratory Data Analysis - 2

# Box plot of 2007 GDP per capita by continent, with the continents ordered
# from lowest to highest median GDP per capita
ggplot(
  data = gdppercap2007,
  mapping = aes(
    x = reorder(continent, gdpPercap, FUN = median),
    y = gdpPercap,
    color = continent
  )
) +
  geom_boxplot() +
  ggtitle("2007 Lowest -> Highest Continent wise gdpPercap distribution") +
  # Explicit axis titles: without xlab() the x axis would be labelled with the
  # raw reorder(...) expression
  xlab("Continent") +
  ylab("gdpPercap")

#Africa has the lowest GDP per capita, whereas Europe has the highest, for 2007.

Exploratory Data Analysis - 3

# The ten countries with the largest GDP per capita in 2007
# (top_n() keeps ties, so more than ten rows are possible)
top10_gdppercap2007 <- gapminder_unfiltered %>%
  filter(year == 2007) %>%
  select(country, gdpPercap) %>%
  top_n(10, gdpPercap)

# Bar chart of those countries, bars ordered from largest to smallest value
ggplot(
  top10_gdppercap2007,
  aes(
    x = reorder(factor(country), desc(gdpPercap)),
    y = gdpPercap,
    fill = factor(country)
  )
) +
  geom_bar(stat = "identity") +
  labs(
    title = "2007 Top10 Countries with largest gdpPercap",
    x = "Top10_Countries",
    y = "gdpPercap"
  )

#Qatar has the highest GDP per capita for 2007.

Exploratory Data Analysis - 4

# Year and GDP per capita for every India observation in the data
India_gdppercap <- gapminder_unfiltered %>%
  filter(country == 'India') %>%
  select(year, gdpPercap)

# Bar chart with one bar per year; bars are ordered by ascending GDP per capita
ggplot(
  India_gdppercap,
  aes(x = reorder(factor(year), gdpPercap), y = gdpPercap, fill = year)
) +
  geom_bar(stat = "identity") +
  labs(
    title = "India's gdpPercap over the years",
    x = "Year",
    y = "gdpPercap"
  )

Exploratory Data Analysis - 5

# GDP per capita of India in 2002 and in 2007, each kept as a one-column tibble
India_GDP_2002 <- gapminder_unfiltered %>%
  filter(country == 'India', year == 2002) %>%
  select(gdpPercap)
India_GDP_2007 <- gapminder_unfiltered %>%
  filter(country == 'India', year == 2007) %>%
  select(gdpPercap)

# Percentage growth of GDP per capita from 2002 to 2007
Growth_GDP_2007 <- (India_GDP_2007 - India_GDP_2002) * 100 / India_GDP_2002
Growth_GDP_2007
##   gdpPercap
## 1  40.38546

Exploratory Data Analysis - 6

# GDP per capita of India in 1952, the baseline for the overall growth figure
India_GDP_1952 <- gapminder_unfiltered %>%
  filter(country == 'India', year == 1952) %>%
  select(gdpPercap)

# Percentage growth of GDP per capita from 1952 to 2007
Growth_GDP_Overall <- (India_GDP_2007 - India_GDP_1952) * 100 / India_GDP_1952
Growth_GDP_Overall
##   gdpPercap
## 1  348.6579
# India's GDP per capita by year as a bar chart with a smoothed trend line.
# fill is mapped on the bar layer only, and the smoother gets one explicit
# group: with a discrete x axis ggplot2 would otherwise place every year in its
# own group, leaving a single point per group and no curve to draw.
ggplot(
  India_gdppercap,
  aes(x = reorder(factor(year), gdpPercap), y = gdpPercap)
) +
  geom_bar(aes(fill = year), stat = "identity") +
  geom_smooth(aes(group = 1)) +
  ggtitle("India's gdpPercap over the years") +
  ylab("gdpPercap") +
  xlab("Year")

#India's GDP per capita has grown over the years, increasing by 348.7% between 1952 and 2007.