Synopsis

This is the homework for Week 4 of Data Wrangling. This week focuses on data manipulation and visualization in R.

Packages required

The following packages are required for this homework:

library(gapminder)
library(dplyr)
library(Hmisc)
library(rworldmap)
library(ggplot2)

Source Code

The data set contains the following variables from left to right

# List the column names of the data set in their stored (left-to-right) order.
colnames(gapminder_unfiltered)
[1] "country"   "continent" "year"      "lifeExp"   "pop"       "gdpPercap"

Data Description

The `describe` function shows descriptive statistics for every variable, including the number of missing values (NAs), the number of unique values, and selected quantiles.
# Print per-variable descriptive statistics (n, missing, unique, quantiles or
# frequencies, lowest/highest values) for the full data set.
Hmisc::describe(gapminder_unfiltered)
gapminder_unfiltered 

 6  Variables      3313  Observations
---------------------------------------------------------------------------
country 
      n missing  unique 
   3313       0     187 

lowest : Afghanistan        Albania            Algeria            Angola             Argentina         
highest: Vietnam            West Bank and Gaza Yemen, Rep.        Zambia             Zimbabwe           
---------------------------------------------------------------------------
continent 
      n missing  unique 
   3313       0       6 

          Africa Americas Asia Europe FSU Oceania
Frequency    637      470  578   1302 139     187
%             19       14   17     39   4       6
---------------------------------------------------------------------------
year 
      n missing  unique    Info    Mean     .05     .10     .25     .50 
   3313       0      58       1    1980    1952    1957    1967    1982 
    .75     .90     .95 
   1996    2002    2007 

lowest : 1950 1951 1952 1953 1954, highest: 2003 2004 2005 2006 2007 
---------------------------------------------------------------------------
lifeExp 
      n missing  unique    Info    Mean     .05     .10     .25     .50 
   3313       0    2571       1   65.24   41.22   45.37   58.33   69.61 
    .75     .90     .95 
  73.66   77.12   78.68 

lowest : 23.60 28.80 30.00 30.02 30.33
highest: 82.21 82.27 82.36 82.60 82.67 
---------------------------------------------------------------------------
pop 
        n   missing    unique      Info      Mean       .05       .10 
     3313         0      3312         1  31773251    235605    436150 
      .25       .50       .75       .90       .95 
  2680018   7559776  19610538  56737055 121365965 

lowest :      59412      59461      60011      60427      61325
highest: 1110396331 1164970000 1230075000 1280400000 1318683096 
---------------------------------------------------------------------------
gdpPercap 
      n missing  unique    Info    Mean     .05     .10     .25     .50 
   3313       0    3313       1   11314   665.7   887.9  2505.3  7825.8 
    .75     .90     .95 
17355.7 26592.7 31534.9 

lowest :    241.2    277.6    298.8    299.9    312.2
highest:  82011.0  95458.1 108382.4 109347.9 113523.1 
---------------------------------------------------------------------------

Exploratory Data Analysis

Question 1 Distribution of GDP in 2007
# Keep only the 2007 observations; `year2007` is reused by later questions.
year2007 <- filter(gapminder_unfiltered, year == 2007)

# Open a map-shaped plotting window on the session's default graphics device.
# The device used to be hard-coded to 'x11', which is not available on every
# platform; mapDevice()'s default is platform independent.
mapDevice()

# Join the 2007 rows to rworldmap's coarse-resolution country map, matching
# on the country name.
spdf <- joinCountryData2Map(
  year2007,
  joinCode = "NAME",
  nameJoinColumn = "country"
)

# Shade each country by its GDP per capita using fixed-width categories.
mapCountryData(
  spdf,
  nameColumnToPlot = "gdpPercap",
  catMethod = "fixedWidth",
  colourPalette = c("yellow", "orange", "red", "brown"),
  mapTitle = "GDP per capita for 2007"
)

Question 2 Distribution of GDP in 2007 by continents
# Roll the 2007 data up to one value per continent: the sum of the member
# countries' gdpPercap values.
# NOTE(review): summing per-capita figures is not a population-weighted
# continent GDP per capita; confirm whether
# sum(gdpPercap * pop) / sum(pop) was intended.
GDP_2007_rollup <- with(year2007, tapply(gdpPercap, continent, FUN = sum))
GDP_2007_rollup <- as.data.frame.table(GDP_2007_rollup)
names(GDP_2007_rollup) <- c("continent", "gdpContinent")

# Attach the continent-level value to every country row so it can be mapped
# per country.
GDP_2007_combined <- merge(GDP_2007_rollup, year2007, by = "continent")

# Open a map-shaped plotting window on the session's default graphics device.
# The device used to be hard-coded to 'x11', which is not available on every
# platform; mapDevice()'s default is platform independent.
mapDevice()

# Join to a coarse-resolution map, matching on the country name.
spdf <- joinCountryData2Map(
  GDP_2007_combined,
  joinCode = "NAME",
  nameJoinColumn = "country"
)

# Shade each country by its continent's rolled-up value.
mapCountryData(
  spdf,
  nameColumnToPlot = "gdpContinent",
  catMethod = "fixedWidth",
  colourPalette = c("yellow", "orange", "red", "brown"),
  mapTitle = "GDP per capita across continents for 2007"
)

Question 3. The following are the top 10 countries by GDP per capita for 2007
# Rank the 2007 rows by GDP per capita, highest first, and show the top ten.
year2007 %>%
  arrange(desc(gdpPercap)) %>%
  head(10)
Question 4. GDP per Capita of India across years
# Subset the full data set to India's rows.
India <- gapminder_unfiltered %>%
  filter(country == "India")

# Draw a smoothed trend of GDP per capita against year, without the
# confidence band.
ggplot(India, aes(x = year, y = gdpPercap)) +
  geom_smooth(color = "Brown", size = 2, se = FALSE) +
  labs(title = "GDP per Capita of India", y = "GDP in USD") +
  theme_light()

Question 5. Growth or decline in GDP per capita in the year 2007
# Percentage change in GDP per capita for each country's 2007 observation,
# relative to that country's previous observation in the data set.
gapminder_unfiltered %>%
  group_by(country) %>%
  mutate(
    # order_by = year makes "previous observation" explicit instead of
    # relying on the rows already being sorted by year within each country.
    previous_gdp = lag(gdpPercap, order_by = year),
    percent_growth = (gdpPercap - previous_gdp) / previous_gdp * 100
  ) %>%
  # Drop the grouping so it does not leak into anything chained afterwards.
  ungroup() %>%
  filter(year == 2007) %>%
  select(country, percent_growth)
Question 6. Growth or Decline of India’s GDP over years
# Sort India's rows chronologically so each row follows its predecessor.
India <- arrange(India, year)

nrow(India)

# Percentage change in GDP per capita relative to the previous observation.
# This replaces a discarded mutate() call and a row-by-row loop with one
# vectorized expression. The first row has no predecessor, so its NA growth
# is replaced with 0.
India <- mutate(
  India,
  growth = (gdpPercap - lag(gdpPercap)) / lag(gdpPercap) * 100,
  growth = replace(growth, is.na(growth), 0)
)

# Plot growth over time. The axis titles are plain labels: the `name`
# arguments previously received the data vectors themselves, which also
# overrode the intended y-axis label.
# The x limits start at 1957, presumably to leave out the first observation,
# whose growth is only the 0 placeholder.
ggplot(data = India, aes(x = year, y = growth)) +
  geom_line(colour = "Brown", size = 2, arrow = arrow()) +
  ggtitle("GDP growth of India") +
  scale_x_continuous(
    name = "Year",
    breaks = pretty(India$year, n = 10),
    limits = c(1957, 2007)
  ) +
  scale_y_continuous(
    name = "GDP growth (%)",
    breaks = pretty(India$growth, n = 10)
  ) +
  theme_light()