Synopsis

This HTML document reports on the completion of the 4th week’s assignment. It lists the packages I used and presents the code, a description of the data, and a graphical analysis of it.

Packages Required

The following packages were used to perform the exercise:

library(ggplot2)   #To produce graphs in R
library(gapminder) #To retrieve the data
library(tidyverse) #set of packages which share common data representations and 'API' design

Data Analysis of Gapminder_Unfiltered

Source Code

The gapminder_unfiltered dataset, which is our analysis data, consists of 3313 rows and the following 6 columns.

1. country: (Name of the country)
2. continent: (Continent where the country lies)
3. year: (Values from 1950 to 2007)
4. lifeExp: (Life expectancy at birth, in years)
5. pop: (total population of the country)
6. gdpPercap: (GDP per capita)

Data Description

#Understanding data: printing the tibble shows its size, column types and first ten rows
gapminder_unfiltered
## # A tibble: 3,313 × 6
##        country continent  year lifeExp      pop gdpPercap
##         <fctr>    <fctr> <int>   <dbl>    <int>     <dbl>
## 1  Afghanistan      Asia  1952  28.801  8425333  779.4453
## 2  Afghanistan      Asia  1957  30.332  9240934  820.8530
## 3  Afghanistan      Asia  1962  31.997 10267083  853.1007
## 4  Afghanistan      Asia  1967  34.020 11537966  836.1971
## 5  Afghanistan      Asia  1972  36.088 13079460  739.9811
## 6  Afghanistan      Asia  1977  38.438 14880372  786.1134
## 7  Afghanistan      Asia  1982  39.854 12881816  978.0114
## 8  Afghanistan      Asia  1987  40.822 13867957  852.3959
## 9  Afghanistan      Asia  1992  41.674 16317921  649.3414
## 10 Afghanistan      Asia  1997  41.763 22227415  635.3414
## # ... with 3,303 more rows
#creating an explicit copy; all later steps work on mygap rather than on the package dataset
mygap <- gapminder_unfiltered

#Checking structure: class of the object plus type and leading values of each column
str(mygap)
## Classes 'tbl_df', 'tbl' and 'data.frame':    3313 obs. of  6 variables:
##  $ country  : Factor w/ 187 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 6 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ pop      : int  8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num  779 821 853 836 740 ...
#Checking for total rows and variables: dim() returns the row count followed by the column count
dim(mygap)
## [1] 3313    6
#Checking top and bottom values; head() prints the first six rows
head(mygap)
## # A tibble: 6 × 6
##       country continent  year lifeExp      pop gdpPercap
##        <fctr>    <fctr> <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan      Asia  1952  28.801  8425333  779.4453
## 2 Afghanistan      Asia  1957  30.332  9240934  820.8530
## 3 Afghanistan      Asia  1962  31.997 10267083  853.1007
## 4 Afghanistan      Asia  1967  34.020 11537966  836.1971
## 5 Afghanistan      Asia  1972  36.088 13079460  739.9811
## 6 Afghanistan      Asia  1977  38.438 14880372  786.1134
#tail() prints the last six rows
tail(mygap)
## # A tibble: 6 × 6
##    country continent  year lifeExp      pop gdpPercap
##     <fctr>    <fctr> <int>   <dbl>    <int>     <dbl>
## 1 Zimbabwe    Africa  1982  60.363  7636524  788.8550
## 2 Zimbabwe    Africa  1987  62.351  9216418  706.1573
## 3 Zimbabwe    Africa  1992  60.377 10704340  693.4208
## 4 Zimbabwe    Africa  1997  46.809 11404948  792.4500
## 5 Zimbabwe    Africa  2002  39.989 11926563  672.0386
## 6 Zimbabwe    Africa  2007  43.487 12311143  469.7093
#Counting missing values per numeric column: real NAs plus any -99 placeholder codes.
#is.na() is needed because comparing with == never matches NA, and %in% is used
#for the placeholder because it returns FALSE (not NA) for missing entries, so
#the sum is always a proper count of affected rows.
sum(is.na(mygap$gdpPercap) | mygap$gdpPercap %in% -99)
## [1] 0
sum(is.na(mygap$pop) | mygap$pop %in% -99)
## [1] 0
sum(is.na(mygap$lifeExp) | mygap$lifeExp %in% -99)
## [1] 0
#Displaying summary statistics: level counts for the factor columns, quartiles and mean for the numeric ones
summary(mygap)
##            country        continent         year         lifeExp     
##  Czech Republic:  58   Africa  : 637   Min.   :1950   Min.   :23.60  
##  Denmark       :  58   Americas: 470   1st Qu.:1967   1st Qu.:58.33  
##  Finland       :  58   Asia    : 578   Median :1982   Median :69.61  
##  Iceland       :  58   Europe  :1302   Mean   :1980   Mean   :65.24  
##  Japan         :  58   FSU     : 139   3rd Qu.:1996   3rd Qu.:73.66  
##  Netherlands   :  58   Oceania : 187   Max.   :2007   Max.   :82.67  
##  (Other)       :2965                                                 
##       pop              gdpPercap       
##  Min.   :5.941e+04   Min.   :   241.2  
##  1st Qu.:2.680e+06   1st Qu.:  2505.3  
##  Median :7.560e+06   Median :  7825.8  
##  Mean   :3.177e+07   Mean   : 11313.8  
##  3rd Qu.:1.961e+07   3rd Qu.: 17355.8  
##  Max.   :1.319e+09   Max.   :113523.1  
## 

GDP per capita across all countries

# Keep only the 2007 observations, richest country first, so that the printed
# table reads as a ranking.
Result1 <- mygap %>%
  filter(year == 2007) %>%
  arrange(desc(gdpPercap))

# arrange() does not change the plotting order: ggplot2 places a factor on the
# axis by its levels, so the countries are reordered explicitly by GDP per
# capita (highest first) to match the table.
ggplot(data = Result1) +
  geom_point(
    mapping = aes(x = reorder(country, desc(gdpPercap)), y = gdpPercap),
    color = "green"
  ) +
  ggtitle("Distribution of GDP per capita across all countries | 2007") +
  ylab("GDP per capita") +
  xlab("Country") +
  # One label per country: rotate and shrink them so they do not overlap
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 5)
  )

Result1
## # A tibble: 183 × 6
##             country continent  year lifeExp       pop gdpPercap
##              <fctr>    <fctr> <int>   <dbl>     <int>     <dbl>
## 1             Qatar      Asia  2007  75.588    907229  82010.98
## 2      Macao, China      Asia  2007  80.718    456989  54589.82
## 3            Norway    Europe  2007  80.196   4627926  49357.19
## 4            Brunei      Asia  2007  77.118    386511  48014.59
## 5            Kuwait      Asia  2007  77.588   2505559  47306.99
## 6         Singapore      Asia  2007  79.972   4553009  47143.18
## 7     United States  Americas  2007  78.242 301139947  42951.65
## 8           Ireland    Europe  2007  78.885   4109086  40676.00
## 9  Hong Kong, China      Asia  2007  82.208   6980412  39724.98
## 10      Switzerland    Europe  2007  81.701   7554661  37506.42
## # ... with 173 more rows

Difference in distributions across continents

# Box plot of 2007 GDP per capita by continent (uses Result1, the 2007 subset).
# Continents are ordered by their median GDP per capita.
ggplot(data = Result1) +
  geom_boxplot(
    mapping = aes(
      x = reorder(continent, gdpPercap, FUN = median),
      y = gdpPercap,
      color = continent
    )
  ) +
  ggtitle("Difference in distributions across the different continents | 2007") +
  # Continent is mapped to x and GDP per capita to y, so label them accordingly
  xlab("Continent") +
  ylab("GDP per capita")

Top 10 countries with the largest GDP per capita

# Ten richest countries of 2007: restrict to that year, keep the three columns
# of interest, sort by GDP per capita (highest first) and take the first ten.
Result3 <- mygap %>%
  filter(year == 2007) %>%
  select(country, gdpPercap, year) %>%
  arrange(desc(gdpPercap)) %>%
  head(10)

# geom_col() takes bar heights directly from y, the same as
# geom_bar(stat = "identity"). Bars run from highest to lowest GDP per capita.
ggplot(
  Result3,
  aes(
    x = reorder(factor(country), desc(gdpPercap)),
    y = gdpPercap,
    fill = country
  )
) +
  geom_col() +
  labs(
    title = "Top 10 countries with the largest GDP per capita | 2007",
    x = "Countries",
    y = "GDP per Capita"
  )

Result3
## # A tibble: 10 × 3
##             country gdpPercap  year
##              <fctr>     <dbl> <int>
## 1             Qatar  82010.98  2007
## 2      Macao, China  54589.82  2007
## 3            Norway  49357.19  2007
## 4            Brunei  48014.59  2007
## 5            Kuwait  47306.99  2007
## 6         Singapore  47143.18  2007
## 7     United States  42951.65  2007
## 8           Ireland  40676.00  2007
## 9  Hong Kong, China  39724.98  2007
## 10      Switzerland  37506.42  2007

GDP per capita for India for all years

# India's GDP per capita for every year present in the data
Result4 <- mygap %>%
  filter(country == "India") %>%
  select(gdpPercap, year)

# One bar per year; the fill shade follows the year and bars get a blue outline
ggplot(Result4, aes(x = year, y = gdpPercap, fill = year)) +
  geom_col(color = "blue") +
  labs(
    title = "GDP per capita for India for all years ",
    x = "Year",
    y = "GDP per capita"
  )

Result4
## # A tibble: 12 × 2
##    gdpPercap  year
##        <dbl> <int>
## 1   546.5657  1952
## 2   590.0620  1957
## 3   658.3472  1962
## 4   700.7706  1967
## 5   724.0325  1972
## 6   813.3373  1977
## 7   855.7235  1982
## 8   976.5127  1987
## 9  1164.4068  1992
## 10 1458.8174  1997
## 11 1746.7695  2002
## 12 2452.2104  2007

Percent growth (or decline) in GDP per capita

# India's GDP per capita in 2002 and in 2007, each kept as a one-row,
# one-column table so the arithmetic below yields a table as well.
IGDP2002 <- mygap %>%
  filter(country == "India", year == 2002) %>%
  select(gdpPercap)
IGDP2007 <- mygap %>%
  filter(country == "India", year == 2007) %>%
  select(gdpPercap)

# Percent change from 2002 to 2007
Result5 <- (IGDP2007 - IGDP2002) * 100 / IGDP2002
Result5
##   gdpPercap
## 1  40.38546

Historical growth (or decline) in GDP per capita for India

# Percent change in India's GDP per capita relative to the previous
# observation. The first observation has no predecessor, so its value is NA.
Result6 <- mygap %>%
  group_by(country) %>%
  filter(country == "India") %>%
  # lag() takes the value from the preceding row, so sort by year explicitly
  # instead of relying on the order in which the rows happen to be stored
  arrange(year) %>%
  mutate(percent = 100 * ((gdpPercap - lag(gdpPercap)) / lag(gdpPercap)))

# GDP per capita over time: observed values as points plus a smoothed trend
ggplot(data = Result6) +
  geom_smooth(mapping = aes(x = year, y = gdpPercap), color = "red") +
  geom_point(mapping = aes(x = year, y = gdpPercap), color = "yellow") +
  ggtitle("Historical growth (or decline) in GDP per capita for India") +
  ylab("GDP per capita") +
  xlab("year")

Result6
## Source: local data frame [12 x 7]
## Groups: country [1]
## 
##    country continent  year lifeExp        pop gdpPercap   percent
##     <fctr>    <fctr> <int>   <dbl>      <int>     <dbl>     <dbl>
## 1    India      Asia  1952  37.373  372000000  546.5657        NA
## 2    India      Asia  1957  40.249  409000000  590.0620  7.958100
## 3    India      Asia  1962  43.605  454000000  658.3472 11.572539
## 4    India      Asia  1967  47.193  506000000  700.7706  6.443935
## 5    India      Asia  1972  50.651  567000000  724.0325  3.319477
## 6    India      Asia  1977  54.208  634000000  813.3373 12.334362
## 7    India      Asia  1982  56.596  708000000  855.7235  5.211394
## 8    India      Asia  1987  58.553  788000000  976.5127 14.115439
## 9    India      Asia  1992  60.223  872000000 1164.4068 19.241341
## 10   India      Asia  1997  61.765  959000000 1458.8174 25.284173
## 11   India      Asia  2002  62.879 1034172547 1746.7695 19.738728
## 12   India      Asia  2007  64.698 1110396331 2452.2104 40.385464