Day 3 Example

Basic Recoding of variables

First we load our data and the mosaic library

library(mosaic)
## Loading required package: car
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'mosaic'
## 
## The following objects are masked from 'package:dplyr':
## 
##     do, tally
## 
## The following object is masked from 'package:car':
## 
##     logit
## 
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cov, D, fivenum, IQR, median, prop.test, sd,
##     t.test, var
## 
## The following objects are masked from 'package:base':
## 
##     max, mean, min, print, prod, range, sample, sum
library(RCurl)
## Loading required package: bitops
# Download the PRB 2013 data as raw CSV text, then parse it into a data frame.
url <- "https://raw.githubusercontent.com/coreysparks/data/master/PRB2013_new.csv"
prb_csv_text <- getURL(url)
# `text =` lets read.csv() open and close its own connection, instead of
# leaving an unclosed textConnection() behind.
# stringsAsFactors = TRUE keeps character columns such as Continent as factors
# (the default before R 4.0); the summary(prbdata$Continent) call later in this
# document only prints per-level counts for a factor.
# NOTE(review): dec = "," is carried over from the original; confirm it is
# intended, since "," is also the field separator used by read.csv().
prbdata <- read.csv(text = prb_csv_text, header = TRUE, dec = ",",
                    stringsAsFactors = TRUE)

do a simple frequency table of continents

# Frequency table of the Continent column.
table(prbdata[["Continent"]])
## 
##        Africa          Asia        Europe North America       Oceania 
##            55            51            45            27            17 
## South America 
##            13

note, this is the same as:

# summary() of the Continent column (per-level counts when it is a factor).
summary(prbdata[["Continent"]])
##        Africa          Asia        Europe North America       Oceania 
##            55            51            45            27            17 
## South America 
##            13

recode a variable

# Recode Continent into a two-category label: "Africa" vs. "Not Africa".
prbdata[["Africa"]] <- ifelse(
  test = prbdata[["Continent"]] == "Africa",
  yes = "Africa",
  no = "Not Africa"
)

Now we have a dichotomous variable in our data that indicates whether or not a country is in Africa

# Frequency table of the new Africa indicator.
table(prbdata[["Africa"]])
## 
##     Africa Not Africa 
##         55        153

Now we can use our new variable to do some descriptive analysis

# Mean of e0Total within each Africa group (formula interface), ignoring NAs.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
mean(e0Total ~ Africa, data = prbdata, na.rm = TRUE)
##     Africa Not Africa 
##      59.60      74.49
# Standard deviation of e0Total within each Africa group, ignoring NAs.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
sd(e0Total ~ Africa, data = prbdata, na.rm = TRUE)
##     Africa Not Africa 
##      8.608      5.467
# Box-and-whisker plot of e0Total for each Africa group.
bwplot(e0Total ~ Africa, data = prbdata)

plot of chunk unnamed-chunk-6

sometimes, we will need to construct a group variable based on the values of a continuous/numeric variable. One way to do this is to use quantiles

# Default quantiles (0%, 25%, 50%, 75%, 100%) of GNI per capita, ignoring NAs.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
quantile(prbdata$GNIPPPperCapitaUSDollars2012, na.rm = TRUE)
##    0%   25%   50%   75%  100% 
##   370  2905  9390 21790 84670

These are derived from the Empirical Cumulative Distribution Function (ECDF), \( F(x) = \Pr(X \le x) \)

# Build the empirical CDF of the GNIPPPperCapitaUSDollars2012 column with
# ecdf() and draw it with plot().
plot(ecdf(prbdata$GNIPPPperCapitaUSDollars2012))

plot of chunk unnamed-chunk-8

We can use the quantiles to help us make a grouping variable using the cut() function. Here I do this to GDP per capita

# Quartile cut points (0%, 25%, 50%, 75%, 100%) of GNI per capita.
gni_quartiles <- quantile(prbdata$GNIPPPperCapitaUSDollars2012,
                          probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE)
# Bin every country into one of the four quartile groups.
# include.lowest = TRUE closes the first interval on the left; without it the
# observation equal to the minimum falls outside every (a, b] bin and is
# silently turned into NA.
prbdata$GDPgroup <- cut(prbdata$GNIPPPperCapitaUSDollars2012,
                        breaks = gni_quartiles, include.lowest = TRUE)

There should be ~25% of the data in each group

# Share of (non-missing) observations that fall in each GDPgroup level.
prop.table(table(prbdata[["GDPgroup"]]))
## 
##      (370,2.90e+03] (2.90e+03,9.39e+03] (9.39e+03,2.18e+04] 
##              0.2472              0.2528              0.2472 
## (2.18e+04,8.47e+04] 
##              0.2528

Plot life expectancy by the new GDP group variable

# Mean of e0Total within each GDPgroup level, ignoring NAs.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
mean(e0Total ~ GDPgroup, data = prbdata, na.rm = TRUE)
##      (370,2.90e+03] (2.90e+03,9.39e+03] (9.39e+03,2.18e+04] 
##               58.89               69.80               72.64 
## (2.18e+04,8.47e+04] 
##               79.24
# Box-and-whisker plot of e0Total for each GDPgroup level.
bwplot(
  e0Total ~ GDPgroup,
  data = prbdata,
  main = "Life expectancy by GDP group"
)

plot of chunk unnamed-chunk-11

We can use both of these variables to examine the pattern of life expectancy by African/non-African country and by GDP group

# Box-and-whisker plot of e0Total by GDPgroup, one panel per Africa group.
bwplot(
  e0Total ~ GDPgroup | Africa,
  data = prbdata,
  main = "Life expectancy by GDP group by Africa/Not Africa"
)

plot of chunk unnamed-chunk-12