Organizing Data and Playing with ggplot2!

For this exercise we will first collect some data from the web and merge it into a single file, and then apply some of the basic functionality of ggplot2 to it.

Since I will need to scrape several data sets, I wrote a function for that:

# Scrape one HTML table from a web page and return it as a data frame.
#
# Args:
#   link:         address of the page, handed to XML::htmlParse().
#   whichTable:   position (1-based) of the wanted <table> among all the
#                 tables found on the page.
#   whichRowsOut: optional row numbers to leave out (passed as skip.rows).
#   toFactor:     should strings become factors? (passed as stringsAsFactors)
#
# Returns:
#   Whatever XML::readHTMLTable() builds from the selected table node.
scrapData <- function(link, whichTable, whichRowsOut = NULL, toFactor = FALSE) {
    # library() stops with an error if XML is not installed; require() would
    # only warn and let the function fail later with a confusing message.
    library(XML)
    tableNodes <- getNodeSet(htmlParse(link), "//table")
    if (!is.numeric(whichTable) || length(whichTable) != 1 ||
        whichTable < 1 || whichTable > length(tableNodes)) {
        stop("'whichTable' must be a single number between 1 and ",
            length(tableNodes), " (the number of tables found on the page)",
            call. = FALSE)
    }
    # Last expression is the call itself (not an assignment), so the table is
    # returned visibly and no local variable shadows base::table().
    readHTMLTable(tableNodes[[whichTable]], skip.rows = whichRowsOut, trim = TRUE,
        stringsAsFactors = toFactor)
}

I organised the input parameters so that the function is flexible about how a web page presents its data. As the function shows, we must always provide a link and indicate which table to scrape; optionally, we can say which rows should be skipped and whether strings should be converted to factors.

Now I collect the links needed (visit them to understand what data I am scraping!) and apply the function to them:

# Pages that hold the tables to scrape
freedomLink <- "http://en.wikipedia.org/wiki/List_of_Indices_of_Freedom"
terrorismLink <- "http://en.wikipedia.org/wiki/Global_Terrorism_Index"
regionsLink <- "http://cloford.com/resources/codes/"

# One data frame per page; whichTable is the position of the table on that page
Freedom <- scrapData(link = freedomLink, whichTable = 3)
Terrorism <- scrapData(link = terrorismLink, whichTable = 2)
Regions <- scrapData(link = regionsLink, whichTable = 5)

I also have the Human Development Index in a web repository in a simple csv format:

library(RCurl)
# Download the csv as one string. followlocation = TRUE makes curl follow the
# HTTP redirect issued by the legacy raw.github.com host; without it getURL()
# returns the (empty) body of the redirect response instead of the file.
data <- getURL("https://raw.github.com/JoseManuelMAGALLANES/Repository/master/blog2/hdi.csv", 
    followlocation = TRUE)
# Parse the downloaded text as csv
HDI <- read.csv(text = data)

I merge the three scraped data sets and the HDI data into dataCountries. This works because they all have a column named “Country”:

# Join the four tables on their shared "Country" column, left to right;
# merge() keeps only the countries present in every table.
dataCountries <- Reduce(
    function(left, right) merge(left, right, by = "Country"),
    list(Regions, Freedom, Terrorism, HDI)
)

It is important to notice that every data set has a different number of countries. This time, since we just want a taste of ggplot2, I will not do the data cleaning that should normally be done, so I am aware that I will end up with a smaller set of countries. Now, let's pay attention to what variables I have:

# List the column names of the merged data frame
colnames(dataCountries)

##  [1] "Country"                        "Continent"                     
##  [3] "Region"                         "Capital"                       
##  [5] "FIPS"                           "ISO (2)"                       
##  [7] "ISO (3)"                        "ISO (No)"                      
##  [9] "Internet"                       "Note"                          
## [11] "Freedom in the World 2013"      "2013 Index of Economic Freedom"
## [13] "2013 Press Freedom Index"       "2012 Democracy Index"          
## [15] "Rank"                           "Score"                         
## [17] "hdi"

Here we need some manipulation: leaving out the variables (columns) we are not interested in, shortening the names of the chosen ones, and verifying their structure:

# Drop columns 4-10 (Capital ... Note), 12-13 (economic and press freedom
# indices) and 15 (Rank); seven columns remain.
dataCountries <- dataCountries[-c(4:10, 12, 13, 15)]
# Short names for the remaining columns, in their current order
names(dataCountries) <- c(
    "Country", "Continent", "Region",
    "Freedom", "Democracy", "Terrorism", "HDI"
)
# Check the type of every column
str(dataCountries)

## 'data.frame':    147 obs. of  7 variables:
##  $ Country  : chr  "Afghanistan" "Albania" "Algeria" "Angola" ...
##  $ Continent: chr  "Asia" "Europe" "Africa" "Africa" ...
##  $ Region   : chr  "South Asia" "South East Europe" "Northern Africa" "Southern Africa" ...
##  $ Freedom  : chr  "5 not free" "3 partly free" "5 not free" "5 not free" ...
##  $ Democracy: chr  "5 authoritarian regime" "4 hybrid regime" "5 authoritarian regime" "5 authoritarian regime" ...
##  $ Terrorism: chr  "8.669" "0.119" "5.831" "1.696" ...
##  $ HDI      : num  0.37 0.75 0.71 0.51 0.81 0.73 0.94 0.9 0.8 0.52 ...

Something is bothering us: Terrorism is being read as text instead of as a number, and the categorical variables are plain strings rather than factors. Let's make the changes and check again:

# Columns 2-5 (Continent, Region, Freedom, Democracy) are categorical
dataCountries[2:5] <- lapply(dataCountries[2:5], as.factor)
# Column 6 (Terrorism) holds numbers that were read as text
dataCountries[[6]] <- as.double(dataCountries[[6]])
summary(dataCountries)

##    Country             Continent                Region  
##  Length:147         Africa  :44   South West Asia  :17  
##  Class :character   Americas:25   Western Africa   :15  
##  Mode  :character   Asia    :41   South America    :12  
##                     Europe  :34   Southern Africa  :10  
##                     Oceania : 3   South East Asia  : 8  
##                                   South East Europe: 8  
##                                   (Other)          :77  
##           Freedom                    Democracy    Terrorism   
##  1 free       :63   1 full democracy      :22   Min.   :0.00  
##  3 partly free:46   2 flawed democracy    :48   1st Qu.:0.02  
##  5 not free   :38   4 hybrid regime       :35   Median :1.13  
##                     5 authoritarian regime:41   Mean   :2.01  
##                     n/a                   : 1   3rd Qu.:3.58  
##                                                 Max.   :9.56  
##                                                               
##       HDI       
##  Min.   :0.304  
##  1st Qu.:0.524  
##  Median :0.711  
##  Mean   :0.672  
##  3rd Qu.:0.810  
##  Max.   :0.955  
##  NA's   :1

It is better, but 'n/a' is not R's standard missing value (NA), so we want to change it:

# Row position(s) of the Democracy values (column 5) that contain a slash
which(grepl("/", dataCountries[, 5], fixed = TRUE))

## [1] 120

# Turn the literal "n/a" label of column 5 (Democracy) into a real missing
# value. The rows are located by their content rather than by a fixed row
# number, so this keeps working when the scraped tables gain, lose or reorder
# countries.
dataCountries[dataCountries[, 5] %in% "n/a", 5] <- NA
# Re-create the factor so that the now unused "n/a" level disappears
dataCountries[, 5] <- factor(dataCountries[, 5])
# Drop every row that has a missing value in any column
dataCountries <- na.omit(dataCountries)

Now we are ready to PLOT!

TIME TO PLOT (I)

What is the relationship between the levels of Human Development and Terrorism in the countries of the world?

library(ggplot2)
# Scatter plot of the terrorism score against HDI, one point per country.
# Written with ggplot() + geom_point() because qplot() is deprecated in
# current ggplot2 releases.
ggplot(dataCountries, aes(x = HDI, y = Terrorism)) +
    geom_point() +
    labs(title = "Human Development (HDI) and Terrorism \n (World Countries, year 2012)")

plot of chunk unnamed-chunk-9

Coloring the dots by Continent helps us get some better understanding:

# Same scatter plot with larger points coloured by Continent.
# ggplot() replaces the deprecated qplot(); the constant point size goes to
# the layer (size = 5) instead of being wrapped in I().
ggplot(dataCountries, aes(x = HDI, y = Terrorism, colour = Continent)) +
    geom_point(size = 5) +
    labs(title = "Human Development (HDI) and Terrorism by Continents \n (World Countries, year 2012)")

plot of chunk unnamed-chunk-10

As we see, countries are too varied even within Continent, so we may use one of our factor variables (Democracy) to partition the data:

# Scatter plot with larger points coloured by the Democracy factor.
# ggplot() replaces the deprecated qplot(); the constant point size goes to
# the layer (size = 5) instead of being wrapped in I().
ggplot(dataCountries, aes(x = HDI, y = Terrorism, colour = Democracy)) +
    geom_point(size = 5) +
    labs(title = "Human Development (HDI) and Terrorism by level of Democracy \n (World Countries, year 2012)")

plot of chunk unnamed-chunk-11

We could also make the comparison using the factor Freedom, and see what is reported:

# Points coloured by Freedom plus one smoothed trend line per Freedom level.
# ggplot() replaces the deprecated qplot(). Each layer now receives only the
# options it understands: span and se belong to the smoother, not the points.
# qplot() applied the constant size to every layer, so the smoother keeps
# size = 5 to leave the figure unchanged (recent ggplot2 releases call this
# line property `linewidth`).
ggplot(dataCountries, aes(x = HDI, y = Terrorism, colour = Freedom)) +
    geom_point(size = 5) +
    geom_smooth(span = 1, se = FALSE, size = 5) +
    labs(title = "Human Development (HDI) and Terrorism by level of Freedom \n (World Countries, year 2012)")

## geom_smooth: method="auto" and size of largest group is <1000, so using
## loess. Use 'method = x' to change the smoothing method.

plot of chunk unnamed-chunk-12

As can be seen, we have used the "smooth" geom, which lets us look for the functional relationship between the variables, here fitted separately for each group of the partitioning criterion (Freedom).

Finally, we can combine all our variables like this:

# Grid of scatter plots: one row per Freedom level, one column per Democracy
# level, points coloured by Continent.
# ggplot() + facet_grid() replaces the deprecated qplot(..., facets = ).
ggplot(dataCountries, aes(x = HDI, y = Terrorism, colour = Continent)) +
    geom_point(size = 2) +
    facet_grid(Freedom ~ Democracy) +
    labs(title = "Human Development (HDI) and Terrorism by level of Freedom and Democracy \n (World Countries by Continent, year 2012)")

plot of chunk unnamed-chunk-13

However, the coding of our factors is not helping the direction of the analysis, since the number 5 is attached to the lowest level of democracy and of freedom, when a higher number should mean more of each. We make use of the package plyr and its mapvalues function to recode them:

library(plyr)
# Relabel Freedom so that a higher number means more freedom.
dataCountries$Freedom <- mapvalues(dataCountries$Freedom, from = c("1 free", 
    "3 partly free", "5 not free"), to = c("3 Free", "2 Partly Free", "1 Not Free"))
# mapvalues() on a factor only renames the levels and keeps their old order,
# so the order is set explicitly; otherwise the facets would still run from
# "Free" to "Not Free".
dataCountries$Freedom <- factor(dataCountries$Freedom, levels = c("1 Not Free", 
    "2 Partly Free", "3 Free"))

# Relabel Democracy by matching the original labels instead of the factor's
# integer codes, which silently depend on which levels happen to be present.
dataCountries$Democracy <- mapvalues(as.character(dataCountries$Democracy), 
    from = c("1 full democracy", "2 flawed democracy", "4 hybrid regime", 
        "5 authoritarian regime"), to = c("4 full", "3 flawed", "2 hybrid", 
        "1 authoritarian"))
# Same direction as Freedom: from least to most democratic
dataCountries$Democracy <- factor(dataCountries$Democracy, levels = c("1 authoritarian", 
    "2 hybrid", "3 flawed", "4 full"))

So our final plot will be redrawn like this:

# Redraw the grid with the recoded factors: one row per Freedom level, one
# column per Democracy level, points coloured by Continent.
# ggplot() + facet_grid() replaces the deprecated qplot(..., facets = ).
ggplot(dataCountries, aes(x = HDI, y = Terrorism, colour = Continent)) +
    geom_point(size = 2) +
    facet_grid(Freedom ~ Democracy) +
    labs(title = "Human Development (HDI) and Terrorism by level of Freedom and Democracy \n (World Countries by Continent, year 2012)")

plot of chunk unnamed-chunk-15

TIME TO PLOT (II)

Can we make this better?

In this case we present juxtaposed horizontal bars panels for several categorical variables.

First we organize the data:

# One named numeric vector per categorical variable; the names become the
# bar labels.
Income <- c("Under $25,000" = 1.6, "$25,000 to $50,000" = 4.7, "Over $50,000" = 6.3)
Race <- c(Blacks = 1.9, Hispanics = 2.2, "Asians and Pacific Islanders" = 2.8, 
    Whites = 4.4)
Sex <- c(Female = 3.5, Male = 4.5)
Married <- c("With Children Under 18" = 3.7, "Without Children" = 5.2)

# Age bands: four components each, added up to give one value per band
a18.24 <- c(0.4, 1.5, 1.2, 0.7)
a25.34 <- c(1, 1.5, 1.1, 0.5)
a35.44 <- c(1.7, 1.2, 1.4, 0.6)
a45.54 <- c(1.9, 1.5, 1.7, 0.9)
a55.64 <- c(1.2, 1.6, 1.7, 0.8)
a65.up <- c(0.4, 1.1, 0.9, 0.5)
# The first and last bands are entered directly; the middle five are sums
Age <- setNames(
    c(2.3, vapply(list(a18.24, a25.34, a35.44, a45.54, a55.64), sum, numeric(1)), 
        2.9),
    c("Under 18", "18 to 24", "25 to 34", "35 to 44", "45 to 54", "55 to 64", 
        "Over 65")
)

# Everything gathered in one list, in the order the panels are drawn
datList <- list(Sex = Sex, Married = Married, Income = Income, Race = Race, Age = Age)

Then our plotting function (note that it loads other functions from inside its body using “source()”):

# Draw one panel of horizontal bars per vector, the panels stacked in a single
# column, with each bar annotated with its value and its name.
#
# Relies on the panel helpers used below (panelInbounds, panelLayout,
# panelSelect, panelScale, panelFill, panelGrid, panelOutline), which are
# loaded from "panelFunctions.R" in the working directory on every call.
#
# Args:
#   vectorList: named list of named numeric vectors; one panel per element,
#               one bar per value. List names label the panels, vector names
#               label the bars.
#   smallVal:   bars with a value below this get "value : name" written to
#               the right of the bar end; all other bars get the value
#               right-aligned at the bar end and the name at the left edge.
#   xshift, yshift: horizontal and vertical offsets of the bar text.
#   main, xlab: overall title and label of the bottom axis.
#   xPad:       the x axis runs from 0 to xPad times the largest value.
#   titleCex, textCex, labCex: text size of the title, the bar text, and the
#               panel/axis labels.
#   textCol:    colour of the text for bars at or above smallVal.
#   textFont:   font of all bar text.
#   numberCol:  colour of the text for bars below smallVal.
#   decimals:   number of decimal places shown for each value.
#   panFill, barFill, barBorder: pairs of colours; odd panels use the first
#               entry, even panels the second.
#   barHeight:  bar thickness, as a fraction of the space given to one bar.
#   numberFont, font, panBorder: accepted but not used by this function.
#
# Returns: the value of the final mtext() call; called for its plot.
stackedBarPlots <- function(vectorList, smallVal, xshift = 0.03, yshift = 0.05, 
    main = "", xlab = "", xPad = 1.04, titleCex = 1.2, textCex = 0.86, textCol = "black", 
    textFont = 2, numberCol = "white", numberFont = 2, decimals = 1, labCex = 1, 
    font = 1, panFill = c(rgb(0.75, 0.93, 1), rgb(0.85, 0.78, 1)), panBorder = c("black", 
        "black"), barFill = c(rgb(0, 0.45, 1), rgb(0.4, 0.2, 0.8)), barBorder = c("white", 
        "white"), barHeight = 2/3) {
    source("panelFunctions.R")
    nPan <- length(vectorList)
    if (nPan == 0) {
        stop("'vectorList' must contain at least one vector", call. = FALSE)
    }
    xMax <- xPad * max(unlist(vectorList))
    xGrid <- panelInbounds(c(0, xMax))
    # One row of height per bar plus one spare, so longer vectors get taller panels
    rowSize <- as.numeric(vapply(vectorList, length, integer(1))) + 1
    pan <- panelLayout(nr = nPan, nc = 1, leftMar = 0.8, topMar = 0.5, bottomMar = 0.7, 
        rowSize = rowSize)
    panBlock <- panelLayout(nr = 1, nc = 1, leftMar = 0.8, topMar = 0.5, bottomMar = 0.7)
    # Panel labels come from the argument, not from a global object
    nam <- names(vectorList)
    dy <- barHeight/2
    for (i in seq_len(nPan)) {
        x <- vectorList[[i]]
        xNam <- names(x)
        nbar <- length(x)
        y <- rev(seq_len(nbar))  # first value is drawn at the top
        # Fixed number of decimals for every value of this panel
        valueText <- formatC(x, format = "f", digits = decimals)
        panelSelect(pan, i, 1)
        panelScale(rx = c(0, xMax), ry = c(0.5, nbar + 0.5))
        flip <- (i + 1)%%2 + 1  # 1 for odd panels, 2 for even: alternate colours
        panelFill(col = panFill[flip])
        panelGrid(x = xGrid, col = "white", lwd = 2)
        rect(0, y - dy, x, y + dy, col = barFill[flip], border = barBorder[flip])
        for (j in seq_len(nbar)) {
            if (x[j] < smallVal) {
                # Bar too short to hold text: write "value : name" after its end
                text(x[j] + xshift, y[j] + yshift, paste(valueText[[j]], xNam[j], 
                  sep = " : "), col = numberCol, adj = c(0, 0.5), cex = textCex, 
                  font = textFont)
            } else {
                # Value right-aligned at the bar end, name left-aligned at the start
                text(x[j] - xshift, y[j] + yshift, valueText[[j]], col = textCol, 
                  adj = 1, cex = textCex, font = textFont)
                text(xshift, y[j] + yshift, xNam[j], col = textCol, adj = 0, 
                  cex = textCex, font = textFont)
            }
        }
        panelOutline()
        mtext(text = nam[i], side = 2, line = 0.5, las = 2, cex = labCex)
        # Only the bottom panel carries the x axis and its label
        if (i == nPan) {
            axis(side = 1, at = xGrid, tck = 0, mgp = c(2, 0.15, 0), cex.axis = labCex)
            mtext(xlab, side = 1, line = 1.5, cex = labCex)
        }
    }

    # Outline around the whole stack, and the main title above it
    panelSelect(panBlock, 1, 1)
    panelScale()
    panelOutline()
    mtext(text = main, side = 3, line = 1, cex = titleCex)
}

And use the data with the function(s):

# Title of the figure
title <- "Long Distance Trips Per Person for 1995"
# Same light grey behind every panel; bars alternate between two
# semi-transparent colours (alpha 100 out of 255).
stackedBarPlots(
    datList,
    smallVal = 1,
    xshift = 0.05,
    main = title,
    xlab = "Average Trips",
    panFill = rep(gray(0.9), 2),
    barFill = c(
        rgb(34, 139, 34, alpha = 100, maxColorValue = 255),
        rgb(255, 160, 122, alpha = 100, maxColorValue = 255)
    )
)

plot of chunk unnamed-chunk-18