First we load our data and the mosaic library
library(mosaic)
## Loading required package: car
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'mosaic'
##
## The following objects are masked from 'package:dplyr':
##
## do, tally
##
## The following object is masked from 'package:car':
##
## logit
##
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cov, D, fivenum, IQR, median, prop.test, sd,
## t.test, var
##
## The following objects are masked from 'package:base':
##
## max, mean, min, print, prod, range, sample, sum
library(RCurl)
## Loading required package: bitops
# Download the CSV at `url` from GitHub as a single character string.
url <- "https://raw.githubusercontent.com/coreysparks/data/master/PRB2013_new.csv"
prbdata <- getURL(url)
# Parse the downloaded text directly with `text =` instead of wrapping it in a
# textConnection() that is never closed. stringsAsFactors = TRUE keeps string
# columns such as Continent as factors (the default before R 4.0), which the
# summary() call on Continent further down relies on to print counts.
prbdata <- read.csv(text = prbdata, header = TRUE, dec = ",",
                    stringsAsFactors = TRUE)
do a simple frequency table of continents
# Count how many countries fall in each continent.
table(prbdata[["Continent"]])
##
## Africa Asia Europe North America Oceania
## 55 51 45 27 17
## South America
## 13
Note: when Continent is stored as a factor, this is the same as:
# summary() of the Continent column.
summary(prbdata[["Continent"]])
## Africa Asia Europe North America Oceania
## 55 51 45 27 17
## South America
## 13
recode a variable
# Label each row "Africa" or "Not Africa" according to its Continent value.
prbdata[["Africa"]] <- ifelse(
  prbdata[["Continent"]] == "Africa",
  "Africa",
  "Not Africa"
)
Now we have a dichotomous variable in our data that indicates whether or not a country is in Africa
# Check the recode: counts of African vs. non-African countries.
table(prbdata[["Africa"]])
##
## Africa Not Africa
## 55 153
Now we can use our new variable to do some descriptive analysis
# Mean of e0Total within each Africa group (mosaic formula interface).
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
mean(e0Total ~ Africa, data = prbdata, na.rm = TRUE)
## Africa Not Africa
## 59.60 74.49
# Standard deviation of e0Total within each Africa group.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
sd(e0Total ~ Africa, data = prbdata, na.rm = TRUE)
## Africa Not Africa
## 8.608 5.467
# Box-and-whisker plot of e0Total for each Africa group.
bwplot(e0Total ~ Africa, data = prbdata)
Sometimes we will need to construct a group variable based on the values of a continuous/numeric variable. One way to do this is to use quantiles.
# Default quantiles (0%, 25%, 50%, 75%, 100%) of GNI per capita, ignoring NAs.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
quantile(prbdata$GNIPPPperCapitaUSDollars2012, na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 370 2905 9390 21790 84670
These are derived from the Empirical Cumulative Distribution Function, \(F(x) = \Pr(X \le x)\)
# Plot the empirical cumulative distribution function of GNI per capita.
# The call is left exactly as written: the default plot title is taken from
# the text of the ecdf() call, so rewording the expression would change it.
plot(ecdf(prbdata$GNIPPPperCapitaUSDollars2012))
We can use the quantiles to help us make a grouping variable using the cut() function. Here I do this for GNI per capita (PPP), storing the result in a variable called GDPgroup
# Split GNI per capita into four groups, using its own quartiles as breaks.
# include.lowest = TRUE closes the first interval on the left; without it
# cut() builds (a, b] intervals, so the observation equal to the minimum
# falls outside every interval and is silently turned into NA.
prbdata$GDPgroup <- cut(
  prbdata$GNIPPPperCapitaUSDollars2012,
  breaks = quantile(prbdata$GNIPPPperCapitaUSDollars2012,
                    probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE),
  include.lowest = TRUE
)
There should be ~25% of the data in each group
# Share of (non-missing) observations that fall in each GDPgroup level.
prop.table(table(prbdata[["GDPgroup"]]))
##
## (370,2.90e+03] (2.90e+03,9.39e+03] (9.39e+03,2.18e+04]
## 0.2472 0.2528 0.2472
## (2.18e+04,8.47e+04]
## 0.2528
Plot life expectancy by the new GDP group variable
# Mean of e0Total within each GDPgroup level.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
mean(e0Total ~ GDPgroup, data = prbdata, na.rm = TRUE)
## (370,2.90e+03] (2.90e+03,9.39e+03] (9.39e+03,2.18e+04]
## 58.89 69.80 72.64
## (2.18e+04,8.47e+04]
## 79.24
# Box-and-whisker plot of e0Total for each GDPgroup level.
bwplot(
  e0Total ~ GDPgroup,
  data = prbdata,
  main = "Life expectancy by GDP group"
)
We can use both of these variables to examine the pattern of life expectancy by African/non-African country and by GDP group
# Same box plot as above, drawn in a separate panel for each Africa group.
bwplot(
  e0Total ~ GDPgroup | Africa,
  data = prbdata,
  main = "Life expectancy by GDP group by Africa/Not Africa"
)