Flowing Data: Bar & Line Charts

Bar Charts

Always start the value axis of bar charts at zero.

read.csv’s colClasses argument (note the capital C; R argument names are case-sensitive): http://www.r-bloggers.com/using-colclasses-to-load-data-more-quickly-in-r/

getwd()

## [1] "/Users/hollyjones/Desktop/FlowingData"

# Switch to the bar-chart tutorial's data folder and list the files in it
data_dir <- "/Users/hollyjones/Desktop/FlowingData/bar-charts-R-tutorial/data"
setwd(data_dir)
list.files()

## [1] "ACS_08_3YR_S1903"         "income-2008-13.csv"      
## [3] "income-totals.csv"        "state_geocodes_v2011.csv"

# Load median household income by state. FIPS is read as text so that codes
# such as "01" keep their leading zero.
income <- read.csv(
  file = "income-totals.csv",
  header = TRUE,
  sep = ",",
  colClasses = c(FIPS = "character")
)
head(income)

##            id FIPS       name households households_moe med_income
## 1 0400000US01   01    Alabama    1838683           5863      43253
## 2 0400000US02   02     Alaska     251899           1331      70760
## 3 0400000US04   04    Arizona    2370289           8769      49774
## 4 0400000US05   05   Arkansas    1129723           4304      40768
## 5 0400000US06   06 California   12542460          20542      61094
## 6 0400000US08   08   Colorado    1977591           4762      58433
##   med_income_moe med_min med_max
## 1            241   43012   43494
## 2            732   70028   71492
## 3            253   49521   50027
## 4            330   40438   41098
## 5            157   60937   61251
## 6            314   58119   58747

# Baseline chart: bars in file order, no labels
barplot(height = income$med_income, main = "Default Chart")

# Sorting: reorder the rows from lowest to highest median income
# (order() sorts ascending unless told otherwise)
ascending_rows <- order(income$med_income)
sort_income <- income[ascending_rows, ]
head(sort_income)

##             id FIPS          name households households_moe med_income
## 52 0400000US72   72   Puerto Rico    1230868           3660      19624
## 25 0400000US28   28   Mississippi    1088073           4665      39031
## 4  0400000US05   05      Arkansas    1129723           4304      40768
## 49 0400000US54   54 West Virginia     741390           3047      41043
## 18 0400000US21   21      Kentucky    1694996           5311      43036
## 1  0400000US01   01       Alabama    1838683           5863      43253
##    med_income_moe med_min med_max
## 52            137   19487   19761
## 25            319   38712   39350
## 4             330   40438   41098
## 49            338   40705   41381
## 18            250   42786   43286
## 1             241   43012   43494

# Horizontal, labelled version of the sorted chart.
#   names.arg  label for each bar
#   horiz      TRUE flips the graph sideways
#   las        label orientation wrt the axis: 0 parallel, 1 horizontal,
#              2 perpendicular, 3 vertical
#   space      adds space between bars
#   par(mar=)  margin sizes in the order bottom, left, top, right
par(mar = c(5, 6, 2, 2))

# leave 10% of headroom beyond the largest value on the income axis
income_axis_max <- max(sort_income$med_income * 1.1)

barplot(
  sort_income$med_income,
  names.arg = sort_income$name,
  horiz = TRUE,
  las = 2,
  cex.names = 0.65,
  cex.axis = 0.65,
  space = 0.9,
  col = "darkred",
  main = "Median Household Income, 2013",
  xlim = c(0, income_axis_max),
  border = NA
)

# Turn on gridlines: nx = NULL aligns them with the x-axis ticks, ny = NA
# draws none in the other direction
grid(nx = NULL, ny = NA, lty = "solid", lwd = 2, col = "black")

#bar width scales with # of bars
#separating states into regions
dir()

## [1] "ACS_08_3YR_S1903"         "income-2008-13.csv"      
## [3] "income-totals.csv"        "state_geocodes_v2011.csv"

# Region/division lookup table. Every column is read as text so that codes
# such as "00" and "09" keep their leading zeros.
region_info <- read.csv(
  "state_geocodes_v2011.csv",
  stringsAsFactors = FALSE,
  colClasses = c(
    Region = "character",
    Division = "character",
    FIPS = "character",
    Name = "character"
  )
)
head(region_info)

##   Region Division FIPS                 Name
## 1      1        0   00     Northeast Region
## 2      1        1   00 New England Division
## 3      1        1   09          Connecticut
## 4      1        1   23                Maine
## 5      1        1   25        Massachusetts
## 6      1        1   33        New Hampshire

str(region_info)

## 'data.frame':    64 obs. of  4 variables:
##  $ Region  : chr  "1" "1" "1" "1" ...
##  $ Division: chr  "0" "1" "1" "1" ...
##  $ FIPS    : chr  "00" "00" "09" "23" ...
##  $ Name    : chr  "Northeast Region" "New England Division" "Connecticut" "Maine" ...

# Attach region and division codes to the income table by matching on the
# FIPS code. With merge()'s defaults only rows whose FIPS value appears in
# both tables are kept.
combined_dataset <- merge(x = income, y = region_info, by = "FIPS")
str(combined_dataset)

## 'data.frame':    51 obs. of  12 variables:
##  $ FIPS          : chr  "01" "02" "04" "05" ...
##  $ id            : Factor w/ 52 levels "0400000US01",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ name          : Factor w/ 52 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ households    : int  1838683 251899 2370289 1129723 12542460 1977591 1355849 335707 263649 7158980 ...
##  $ households_moe: int  5863 1331 8769 4304 20542 4762 3506 1773 1588 25790 ...
##  $ med_income    : int  43253 70760 49774 40768 61094 58433 69461 59878 65830 46956 ...
##  $ med_income_moe: int  241 732 253 330 157 314 411 714 1006 149 ...
##  $ med_min       : int  43012 70028 49521 40438 60937 58119 69050 59164 64824 46807 ...
##  $ med_max       : int  43494 71492 50027 41098 61251 58747 69872 60592 66836 47105 ...
##  $ Region        : chr  "3" "4" "4" "3" ...
##  $ Division      : chr  "6" "9" "8" "7" ...
##  $ Name          : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...

#there are 4 unique regions, as expected
unique(combined_dataset$Region)

## [1] "3" "4" "1" "2"

#division code of 0 corresponds to one of the four regions
subset(region_info, Division == "0")

##    Region Division FIPS             Name
## 1       1        0   00 Northeast Region
## 13      2        0   00   Midwest Region
## 28      3        0   00     South Region
## 49      4        0   00      West Region

# Build one table per region for graphing. Each table keeps only the state
# name, median income and region code, ordered from highest to lowest income.
region_by_income <- function(region_code) {
  states <- subset(
    combined_dataset,
    Region == region_code,
    select = c(Name, med_income, Region)
  )
  states[order(states$med_income, decreasing = TRUE), ]
}

NE <- region_by_income("1")  # Northeast

MW <- region_by_income("2")  # Midwest

S <- region_by_income("3")   # South

W <- region_by_income("4")   # West

# Four panels in a 2 x 2 layout, one per region, with a wide left margin so
# the state names fit (mar = bottom, left, top, right)
par(mfrow = c(2, 2), mar = c(5, 10, 4, 2))

# One entry per panel, in drawing order
region_panels <- list(
  list(states = NE, title = "Northeast Region", fill = "blue"),
  list(states = MW, title = "Midwest Region", fill = "gold"),
  list(states = S, title = "South Region", fill = "green"),
  list(states = W, title = "West Region", fill = "red")
)

for (panel in region_panels) {
  barplot(
    panel$states$med_income,
    main = panel$title,
    col = panel$fill,
    horiz = TRUE,
    names.arg = panel$states$Name,
    las = 2,
    xlim = c(0, max(panel$states$med_income * 1.3))
  )

  # turn on gridlines for this panel
  grid(NULL, NA, lty = "solid", lwd = 1, col = "black")
}

Line Charts w/ Annotation

# Return to the default graphics settings: a single panel and R's default
# margins. Resetting mfrow alone would leave the wide left margin
# (mar = c(5, 10, 4, 2)) set for the regional bar charts in effect for the
# line charts that follow.
par(mfrow = c(1, 1), mar = c(5, 4, 4, 2) + 0.1)


getwd()

## [1] "/Users/hollyjones/Desktop/FlowingData"

# Switch to the line-chart tutorial's data folder and list the files in it
lines_data_dir <- "/Users/hollyjones/Desktop/FlowingData/lines/data"
setwd(lines_data_dir)
list.files()

## [1] "country-regions.csv"         "life-expectancy-cleaned.csv"

# Life expectancy table: one row per country, one column per year
life <- read.csv(
  file = "life-expectancy-cleaned.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)
head(life)

##          Country.Name Country.Code    X1960  X1961  X1962  X1963  X1964
## 1               World          WLD 50.76101     NA     NA     NA     NA
## 2         Afghanistan          AFG 31.43600 31.767 32.113 32.475 32.853
## 3             Albania          ALB 61.30900 62.272 63.116 63.811 64.345
## 4             Algeria          DZA 45.96100 46.493 47.043 47.613 48.203
## 5              Angola          AGO 31.53900 31.935 32.331 32.726 33.121
## 6 Antigua and Barbuda          ATG 60.50000     NA     NA     NA     NA
##      X1965  X1966  X1967  X1968  X1969    X1970  X1971  X1972  X1973
## 1 53.98691     NA     NA     NA     NA 57.60103     NA     NA     NA
## 2 33.24500 33.646 34.055 34.466 34.878 35.29500 35.718 36.152 36.594
## 3 64.72700 64.983 65.164 65.314 65.457 65.60700 65.766 65.923 66.074
## 4 48.81000 49.428 50.050 50.673 51.294 51.91900 52.553 53.202 53.868
## 5 33.51600 33.912 34.310 34.709 35.108 35.50800 35.911 36.315 36.714
## 6       NA     NA     NA     NA     NA 65.00000     NA     NA     NA
##    X1974    X1975  X1976  X1977  X1978  X1979    X1980    X1981    X1982
## 1     NA 59.39263     NA     NA     NA     NA 60.88545 61.14777 61.42275
## 2 37.040 37.48200 37.910 38.318 38.700 39.056 39.38900 39.70500 40.01300
## 3 66.223 66.37900 66.546 66.728 66.925 67.140 67.37600 67.63900 67.92100
## 4 54.550 55.23800 55.919 56.585 57.236 57.877 58.53100 59.22900 59.98800
## 5 37.102 37.46900 37.805 38.102 38.357 38.568 38.73400 38.85900 38.95200
## 6     NA       NA     NA     NA     NA     NA       NA       NA 70.37000
##      X1983   X1984    X1985    X1986   X1987    X1988    X1989    X1990
## 1 61.63485 61.8717 62.13293 62.45411 62.7183 62.93638 63.15262 63.33469
## 2 40.31900 40.6260 40.93400 41.24000 41.5390 41.82800 42.10900 42.38400
## 3 68.21200 68.4960 68.74100 68.91200 68.9920 68.98400 68.90300 68.78300
## 4 60.80900 61.6760 62.55600 63.40600 64.1830 64.85700 65.41800 65.86300
## 5 39.02100 39.0750 39.11400 39.13300 39.1370 39.13400 39.14100 39.17800
## 6       NA      NA       NA       NA 70.9400       NA       NA       NA
##      X1991    X1992    X1993    X1994    X1995    X1996    X1997    X1998
## 1 63.51141 63.66851 63.75027 63.90795 64.08667 64.34908 64.60381 64.80676
## 2 42.65600 42.92900 43.20600 43.48900 43.77700 44.06900 44.36200 44.65500
## 3 68.67100 68.61400 68.64700 68.78700 69.04200 69.40400 69.83900 70.31000
## 4 66.20800 66.48700 66.73600 66.97300 67.22100 67.49400 67.79200 68.11300
## 5 39.26400 39.41600 39.64600 39.96500 40.38800 40.92600 41.56700 42.28700
## 6       NA 71.50000       NA       NA       NA       NA 72.00000       NA
##      X1999    X2000   X2001    X2002    X2003    X2004    X2005    X2006
## 1 64.98231 65.19298 65.4227 65.61474 65.83069 66.09486 66.30251 66.58488
## 2 44.94400 45.22700 45.4980 45.75900 46.01000 46.25900 46.51300 46.78300
## 3 70.79500 71.26800 71.7100 72.11000 72.46300 72.76400 73.01200 73.21500
## 4 68.45500 68.81100 69.1660 69.50600 69.81900 70.10100 70.35200 70.57700
## 5 43.06300 43.86200 44.6460 45.38900 46.06800 46.67200 47.19600 47.65300
## 6       NA       NA      NA 72.50000       NA       NA       NA       NA
##      X2007    X2008    X2009
## 1 66.85055 67.09668 67.36927
## 2 47.07800 47.40100 47.75300
## 3 73.39000 73.55100 73.70500
## 4 70.78700 70.99100 71.19300
## 5 48.06700 48.46400 48.85200
## 6       NA       NA       NA

# Reshape the data for easy plotting: the columns representing each year
# become rows, so there is one row per country and year
library(reshape)
id_columns <- c("Country.Name", "Country.Code")
life2 <- melt(life, id.vars = id_columns)

head(life2)

##          Country.Name Country.Code variable    value
## 1               World          WLD    X1960 50.76101
## 2         Afghanistan          AFG    X1960 31.43600
## 3             Albania          ALB    X1960 61.30900
## 4             Algeria          DZA    X1960 45.96100
## 5              Angola          AGO    X1960 31.53900
## 6 Antigua and Barbuda          ATG    X1960 60.50000

# Give the melted columns friendlier headings
library(dplyr)
names(life2) <- c("country", "code", "year", "life-expectancy")

# Drop the leading "X" from each year label (keep everything from the
# second character onwards)
life2$year <- substring(life2$year, first = 2)
head(life2)

##               country code year life-expectancy
## 1               World  WLD 1960        50.76101
## 2         Afghanistan  AFG 1960        31.43600
## 3             Albania  ALB 1960        61.30900
## 4             Algeria  DZA 1960        45.96100
## 5              Angola  AGO 1960        31.53900
## 6 Antigua and Barbuda  ATG 1960        60.50000

tail(life2)

##                     country code year life-expectancy
## 10245               Vietnam  VNM 2009          72.689
## 10246 Virgin Islands (U.S.)  VIR 2009          76.103
## 10247    West Bank and Gaza  PSE 2009          70.875
## 10248           Yemen, Rep.  YEM 2009          63.164
## 10249                Zambia  ZMB 2009          47.416
## 10250              Zimbabwe  ZWE 2009          49.370

# The default example is incorrect: plotting the whole year /
# life-expectancy table in one call does not separate the countries.
# Instead, a for loop must go through the unique values of the country code
# column and plot one line per value (i.e. one line per country).
year_and_expectancy <- life2[, -c(1, 2)]
plot(
  year_and_expectancy,
  xlab = "Year",
  ylab = "Life Expectancy",
  main = "Life Expectancy Across the Globe",
  type = "l"
)

# Start with a blank plot (type = "n") that only sets up the axes and title
plot(
  0, 0,
  type = "n",
  xlim = c(1960, 2010),
  ylim = c(25, 80),
  main = "Life Expectancy, 1960 to 2010",
  ylab = "Life Expectancy",
  xlab = "",
  las = 1,
  lwd = 2,
  bty = "n",
  cex.axis = 0.7
)

# codes = the unique country codes, one per country
codes <- unique(life2$code)

# Draw one translucent line per country.
# The x positions come from each country's own year values rather than a
# fixed 1960:2009 vector, so a country whose rows do not cover exactly those
# 50 years is still drawn at the right positions instead of stopping the loop
# with a length-mismatch error. seq_along() also means the loop body never
# runs when codes is empty (1:length(codes) would iterate over 1 and 0).
for (i in seq_along(codes)) {
  currCountry <- subset(life2, life2$code == codes[i])
  years <- as.numeric(as.character(currCountry$year))
  year_order <- order(years)
  currCountry <- currCountry[year_order, ]
  lines(
    years[year_order],
    currCountry$`life-expectancy`,
    col = "#00200070"
  )
}

# Load each country's region so the data can be narrowed down to just the
# Sub-Saharan African countries
library(sqldf)

countries <- read.csv(
  file = "country-regions.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)
head(countries)

##   CountryCode RegionCode
## 1         AFG        SAS
## 2         ALB        ECS
## 3         DZA        MEA
## 4         AGO        SSF
## 5         ARG        LCN
## 6         ARM        ECS
##                                           RegionName
## 1                                         South Asia
## 2      Europe &amp; Central Asia (all income levels)
## 3 Middle East &amp; North Africa (all income levels)
## 4             Sub-Saharan Africa (all income levels)
## 5  Latin America &amp; Caribbean (all income levels)
## 6      Europe &amp; Central Asia (all income levels)

# life3 is life2 plus each country's region name, matched on the country code
# (called "code" in life2 and "CountryCode" in countries)
region_lookup <- countries[, c("CountryCode", "RegionName")]
life3 <- merge(
  x = life2,
  y = region_lookup,
  by.x = "code",
  by.y = "CountryCode"
)
head(life3)

##   code country year life-expectancy
## 1  ABW   Aruba 1967          66.242
## 2  ABW   Aruba 2000          71.304
## 3  ABW   Aruba 1965          65.660
## 4  ABW   Aruba 2004          71.718
## 5  ABW   Aruba 2001          71.378
## 6  ABW   Aruba 1984          70.660
##                                          RegionName
## 1 Latin America &amp; Caribbean (all income levels)
## 2 Latin America &amp; Caribbean (all income levels)
## 3 Latin America &amp; Caribbean (all income levels)
## 4 Latin America &amp; Caribbean (all income levels)
## 5 Latin America &amp; Caribbean (all income levels)
## 6 Latin America &amp; Caribbean (all income levels)

# List the distinct region names (there are 7)
regions <- unique(life3[["RegionName"]])
regions

## [1] "Latin America &amp; Caribbean (all income levels)" 
## [2] "South Asia"                                        
## [3] "Sub-Saharan Africa (all income levels)"            
## [4] "Europe &amp; Central Asia (all income levels)"     
## [5] "Middle East &amp; North Africa (all income levels)"
## [6] "East Asia &amp; Pacific (all income levels)"       
## [7] "North America"

# Keep only the rows whose region name starts with "Sub-Saharan"
subsah_query <- "SELECT * FROM life3 WHERE RegionName LIKE 'Sub-Saharan%'"
subsah <- sqldf(subsah_query)
head(subsah)

##   code country year life-expectancy                             RegionName
## 1  AGO  Angola 1993          39.646 Sub-Saharan Africa (all income levels)
## 2  AGO  Angola 1973          36.714 Sub-Saharan Africa (all income levels)
## 3  AGO  Angola 1987          39.137 Sub-Saharan Africa (all income levels)
## 4  AGO  Angola 2004          46.672 Sub-Saharan Africa (all income levels)
## 5  AGO  Angola 1994          39.965 Sub-Saharan Africa (all income levels)
## 6  AGO  Angola 2005          47.196 Sub-Saharan Africa (all income levels)

dim(subsah)

## [1] 2400    5

#unique(subsah$RegionName)

# Fresh blank plot (axes and title only) for the Sub-Saharan Africa chart
plot(
  0, 0,
  type = "n",
  xlim = c(1960, 2010),
  ylim = c(25, 80),
  main = "Life Expectancy in Sub-Saharan Africa \n 1960 to 2010",
  ylab = "Life Expectancy",
  xlab = "",
  las = 1,
  lwd = 2,
  bty = "n",
  cex.axis = 0.7
)

# Country codes that occur in the Sub-Saharan subset
subsah_codes <- unique(subsah[["code"]])
subsah_codes

##  [1] "AGO" "BDI" "BEN" "BFA" "BWA" "CAF" "CIV" "CMR" "COD" "COG" "COM"
## [12] "CPV" "ERI" "ETH" "GAB" "GHA" "GIN" "GMB" "GNB" "GNQ" "KEN" "LBR"
## [23] "LSO" "MDG" "MLI" "MOZ" "MRT" "MUS" "MWI" "MYT" "NAM" "NER" "NGA"
## [34] "RWA" "SDN" "SEN" "SLE" "SOM" "STP" "SWZ" "SYC" "TCD" "TGO" "TZA"
## [45] "UGA" "ZAF" "ZMB" "ZWE"

length(subsah_codes)

## [1] 48

#there are 48 unique countries in subsaharan africa

# Another for loop, this time drawing one line per Sub-Saharan country.
# subset(df, condition to subset on, optional select statement for columns to keep)
# The x positions are taken from each country's own year values rather than a
# fixed 1960:2009 vector, so a country with missing or extra years is still
# drawn correctly instead of failing on a length mismatch. seq_along() keeps
# the loop from running at all when subsah_codes is empty.
for (j in seq_along(subsah_codes)) {
  current_country <- subset(subsah, subsah$code == subsah_codes[j])
  years <- as.numeric(as.character(current_country$year))
  year_order <- order(years)
  current_country <- current_country[year_order, ]
  lines(
    years[year_order],
    current_country$`life-expectancy`,
    col = "#00200070"
  )
}

# Make Rwanda's line more obvious: redraw it in red with a heavier stroke.
# The x positions come from Rwanda's own year values (sorted numerically), so
# the line stays aligned even if the rows do not cover exactly 1960-2009.
current_country <- subset(subsah, code == "RWA")
rwanda_years <- as.numeric(as.character(current_country$year))
rwanda_order <- order(rwanda_years)
current_country <- current_country[rwanda_order, ]
lines(
  rwanda_years[rwanda_order],
  current_country$`life-expectancy`,
  col = "red",
  lwd = 2
)

# Annotate: the label and the circle sit at x = 1993 (the year named in the
# label) at the height of the lowest value in Rwanda's series. The minimum is
# computed once and ignores missing values so the annotation is still placed
# when some years have no data.
rwanda_low <- min(current_country$`life-expectancy`, na.rm = TRUE)
text(
  1993 + 0.9,
  rwanda_low - 0.4,
  "In 1993, Rwanda's average life\nexpectancy was 25 years.",
  cex = .95,
  font = 3,
  pos = 4
)
symbols(1993, rwanda_low, circles = 0.5, inches = FALSE, add = TRUE, lwd = 2)

# Line type: a new blank chart on which countries are told apart by line
# style (lty) instead of colour
plot(
  0, 0,
  type = "n",
  xlim = c(1960, 2009),
  ylim = c(23, 80),
  main = "Life Expectancy",
  ylab = "Years from birth",
  xlab = "",
  las = 1,
  lwd = 2,
  bty = "n",
  cex.axis = 0.8
)

# Rwanda (still held in current_country): label at the left edge, then its
# line drawn in black with lty = 2
text(x = 1960, y = 44, labels = "Rwanda", pos = 4)
lines(
  x = 1960:2009,
  y = current_country$`life-expectancy`,
  col = "#000000",
  lwd = 2,
  lty = 2
)

head(life3)

##   code country year life-expectancy
## 1  ABW   Aruba 1967          66.242
## 2  ABW   Aruba 2000          71.304
## 3  ABW   Aruba 1965          65.660
## 4  ABW   Aruba 2004          71.718
## 5  ABW   Aruba 2001          71.378
## 6  ABW   Aruba 1984          70.660
##                                          RegionName
## 1 Latin America &amp; Caribbean (all income levels)
## 2 Latin America &amp; Caribbean (all income levels)
## 3 Latin America &amp; Caribbean (all income levels)
## 4 Latin America &amp; Caribbean (all income levels)
## 5 Latin America &amp; Caribbean (all income levels)
## 6 Latin America &amp; Caribbean (all income levels)

# United States: select its rows with an exact match on the country code
# (a LIKE '%USA%' pattern would also accept any code that merely contains
# those letters), then order them by numeric year.
usa <- sqldf("SELECT * FROM life3 WHERE code = 'USA'")
usa_years <- as.numeric(as.character(usa$year))
usa_order <- order(usa_years)
usa <- usa[usa_order, ]

# Label at the left edge, then the line in black with lty = 3. The x positions
# come from the data's own year values rather than a fixed 1960:2009 vector,
# so the line stays aligned even if some years are absent.
text(1960, 69, "United States", pos = 4)
lines(
  usa_years[usa_order],
  usa$`life-expectancy`,
  col = "#000000",
  lwd = 2,
  lty = 3
)

Flowing Data: Bar & Line Charts

Holly Jones

2016-07-17

Bar Charts

Line Charts w/ Annotation