This is the homework for Week 4 Data Wrangling. This week focuses on data manipulation and visualization in R.
The following packages are required for this homework:
library(gapminder)
library(dplyr)
library(ggplot2)
library(Hmisc)
library(rworldmap)
The data set contains the following variables from left to right
# List the column names of gapminder_unfiltered in their stored order.
names(gapminder_unfiltered)
[1] "country" "continent" "year" "lifeExp" "pop" "gdpPercap"
# Print a per-column summary of gapminder_unfiltered (counts, missing values,
# distinct values and, for numeric columns, mean and quantiles).
# The explicit Hmisc:: prefix pins the call to Hmisc's describe().
Hmisc::describe(gapminder_unfiltered)
gapminder_unfiltered
6 Variables 3313 Observations
---------------------------------------------------------------------------
country
n missing unique
3313 0 187
lowest : Afghanistan Albania Algeria Angola Argentina
highest: Vietnam West Bank and Gaza Yemen, Rep. Zambia Zimbabwe
---------------------------------------------------------------------------
continent
n missing unique
3313 0 6
Africa Americas Asia Europe FSU Oceania
Frequency 637 470 578 1302 139 187
% 19 14 17 39 4 6
---------------------------------------------------------------------------
year
n missing unique Info Mean .05 .10 .25 .50
3313 0 58 1 1980 1952 1957 1967 1982
.75 .90 .95
1996 2002 2007
lowest : 1950 1951 1952 1953 1954, highest: 2003 2004 2005 2006 2007
---------------------------------------------------------------------------
lifeExp
n missing unique Info Mean .05 .10 .25 .50
3313 0 2571 1 65.24 41.22 45.37 58.33 69.61
.75 .90 .95
73.66 77.12 78.68
lowest : 23.60 28.80 30.00 30.02 30.33
highest: 82.21 82.27 82.36 82.60 82.67
---------------------------------------------------------------------------
pop
n missing unique Info Mean .05 .10
3313 0 3312 1 31773251 235605 436150
.25 .50 .75 .90 .95
2680018 7559776 19610538 56737055 121365965
lowest : 59412 59461 60011 60427 61325
highest: 1110396331 1164970000 1230075000 1280400000 1318683096
---------------------------------------------------------------------------
gdpPercap
n missing unique Info Mean .05 .10 .25 .50
3313 0 3313 1 11314 665.7 887.9 2505.3 7825.8
.75 .90 .95
17355.7 26592.7 31534.9
lowest : 241.2 277.6 298.8 299.9 312.2
highest: 82011.0 95458.1 108382.4 109347.9 113523.1
---------------------------------------------------------------------------
# Keep only the observations recorded for 2007.
year2007 <- gapminder_unfiltered %>%
  filter(year == 2007)
# Open a map-shaped plotting window.
mapDevice('x11')
# Attach the 2007 rows to the coarse resolution map, matching on country name.
spdf <- joinCountryData2Map(
  year2007,
  joinCode = "NAME",
  nameJoinColumn = "country"
)
# Shade each country by its GDP per capita using equal-width classes.
mapCountryData(
  spdf,
  nameColumnToPlot = "gdpPercap",
  catMethod = "fixedWidth",
  colourPalette = c('yellow', 'orange', 'red', 'brown'),
  mapTitle = "GDP per capita for 2007"
)
# Continent-level GDP per capita for 2007.
# Per-capita figures cannot be added across countries: a plain sum grows with
# the number of countries in a continent. The continent value is therefore the
# population-weighted mean of the country values, i.e. total GDP / total
# population. weighted.mean() converts integer weights to double, so summing
# populations cannot overflow.
GDP_2007_rollup <- year2007 %>%
  group_by(continent) %>%
  # Spelled dplyr::summarise explicitly so the call cannot be captured by a
  # same-purpose function from another attached package.
  dplyr::summarise(gdpContinent = weighted.mean(gdpPercap, w = pop)) %>%
  as.data.frame()
# Give every 2007 country row its continent's value so it can be mapped per
# country.
GDP_2007_combined <- merge(GDP_2007_rollup, year2007, by = "continent")
# Open a map-shaped plotting window.
mapDevice('x11')
# Join to a coarse resolution map, matching on country name.
spdf <- joinCountryData2Map(
  GDP_2007_combined,
  joinCode = "NAME",
  nameJoinColumn = "country"
)
# Shade each country by its continent's GDP per capita.
mapCountryData(
  spdf,
  nameColumnToPlot = "gdpContinent",
  catMethod = "fixedWidth",
  colourPalette = c('yellow', 'orange', 'red', 'brown'),
  mapTitle = "GDP per capita across continents for 2007"
)
# Show the ten 2007 rows with the highest GDP per capita, largest first.
year2007 %>%
  arrange(desc(gdpPercap)) %>%
  head(10)
# Extract every observation recorded for India.
India <- gapminder_unfiltered %>%
  filter(country == 'India')
# Draw a smoothed trend of India's GDP per capita over time
# (no confidence band).
ggplot(India, aes(x = year, y = gdpPercap)) +
  geom_smooth(color = "Brown", size = 2, se = FALSE) +
  ggtitle("GDP per Capita of India") +
  ylab("GDP in USD") +
  theme_light()
# Percent change in GDP per capita between each country's 2007 observation and
# that country's previous observation.
# Countries without a 2007 row are dropped by the filter.
gapminder_unfiltered %>%
  group_by(country) %>%
  mutate(
    # order_by = year makes "previous" mean the preceding year within the
    # country, rather than whatever row happens to sit above in the data.
    # Plain parentheses are used for grouping; nested curly braces read like
    # the tidy-eval embrace operator.
    percent_growth =
      (gdpPercap - lag(gdpPercap, order_by = year)) /
        lag(gdpPercap, order_by = year) * 100
  ) %>%
  filter(year == 2007) %>%
  # Drop the grouping so the result is an ordinary two-column table.
  ungroup() %>%
  select(country, percent_growth)
# Order India's observations chronologically so each row follows its
# predecessor.
India <- arrange(India, year)
# Add a growth column: percent change in GDP per capita relative to the
# previous observation. The first observation has no predecessor, so its NA
# is replaced with 0. The result is assigned back to India so the column
# is actually stored.
India <- mutate(
  India,
  growth = (gdpPercap - dplyr::lag(gdpPercap)) / dplyr::lag(gdpPercap) * 100,
  growth = replace(growth, is.na(growth), 0)
)
nrow(India)
# Plot growth over time as a line ending in an arrow head.
ggplot(data = India, aes(x = year, y = growth)) +
  geom_line(colour = "Brown", size = 2, arrow = arrow()) +
  ggtitle("GDP growth of India") +
  # Axis titles must be single strings. The data columns are only used to
  # compute the break positions.
  # The x limits start at 1957, so observations before that year are left off
  # the plot.
  scale_x_continuous(
    name = "Year",
    breaks = pretty(India$year, n = 10),
    limits = c(1957, 2007)
  ) +
  scale_y_continuous(
    name = "GDP growth",
    breaks = pretty(India$growth, n = 10)
  ) +
  theme_light()