Always start the value axis of bar charts at zero.
read.csv’s colClasses argument: http://www.r-bloggers.com/using-colclasses-to-load-data-more-quickly-in-r/
# Show the starting directory, then switch to the bar-chart tutorial's data
# folder so the files below can be read by bare name.
getwd()
## [1] "/Users/hollyjones/Desktop/FlowingData"
setwd("/Users/hollyjones/Desktop/FlowingData/bar-charts-R-tutorial/data")
dir()
## [1] "ACS_08_3YR_S1903" "income-2008-13.csv"
## [3] "income-totals.csv" "state_geocodes_v2011.csv"

# Load median household income by state. FIPS is read as character so that
# codes such as "01" keep their leading zero.
income <- read.csv(
  "income-totals.csv",
  header = TRUE,
  sep = ",",
  colClasses = c(FIPS = "character")
)
head(income)
## id FIPS name households households_moe med_income
## 1 0400000US01 01 Alabama 1838683 5863 43253
## 2 0400000US02 02 Alaska 251899 1331 70760
## 3 0400000US04 04 Arizona 2370289 8769 49774
## 4 0400000US05 05 Arkansas 1129723 4304 40768
## 5 0400000US06 06 California 12542460 20542 61094
## 6 0400000US08 08 Colorado 1977591 4762 58433
## med_income_moe med_min med_max
## 1 241 43012 43494
## 2 732 70028 71492
## 3 253 49521 50027
## 4 330 40438 41098
## 5 157 60937 61251
## 6 314 58119 58747
# Default bar chart: one bar per row of `income`, in row order.
barplot(income$med_income, main = "Default Chart")

# SORTING
# Reorder the rows from lowest to highest median income (order() sorts
# ascending unless told otherwise).
sort_income <- income[order(income$med_income), ]
head(sort_income)
## id FIPS name households households_moe med_income
## 52 0400000US72 72 Puerto Rico 1230868 3660 19624
## 25 0400000US28 28 Mississippi 1088073 4665 39031
## 4 0400000US05 05 Arkansas 1129723 4304 40768
## 49 0400000US54 54 West Virginia 741390 3047 41043
## 18 0400000US21 21 Kentucky 1694996 5311 43036
## 1 0400000US01 01 Alabama 1838683 5863 43253
## med_income_moe med_min med_max
## 52 137 19487 19761
## 25 319 38712 39350
## 4 330 40438 41098
## 49 338 40705 41381
## 18 250 42786 43286
## 1 241 43012 43494
# Horizontal bar chart of the sorted incomes.
#   names.arg  labels each bar
#   horiz      TRUE flips the graph sideways
#   las        label orientation wrt axis: 0, 1, 2, 3
#              (parallel, horizontal, perpendicular, vertical)
#   space      adds space between bars
# par(mar = ...) sets margin sizes in this order: bottom, left, top, right.
par(mar = c(5, 6, 2, 2))
barplot(
  sort_income$med_income,
  names.arg = sort_income$name,
  horiz = TRUE,
  las = 2,
  space = 0.9,
  border = NA,
  col = "darkred",
  cex.names = 0.65,
  cex.axis = 0.65,
  xlim = c(0, max(sort_income$med_income * 1.1)),
  main = "Median Household Income, 2013"
)
# Turn on gridlines: NULL places vertical lines at the x-axis ticks, NA
# suppresses the horizontal ones.
grid(NULL, NA, lty = "solid", lwd = 2, col = "black")
# bar width scales with # of bars
# Separating states into regions.
dir()
## [1] "ACS_08_3YR_S1903" "income-2008-13.csv"
## [3] "income-totals.csv" "state_geocodes_v2011.csv"

# Region/division lookup table. All four columns are read as character so the
# numeric-looking codes stay strings.
region_info <- read.csv(
  "state_geocodes_v2011.csv",
  stringsAsFactors = FALSE,
  colClasses = c(
    Region = "character",
    Division = "character",
    FIPS = "character",
    Name = "character"
  )
)
head(region_info)
## Region Division FIPS Name
## 1 1 0 00 Northeast Region
## 2 1 1 00 New England Division
## 3 1 1 09 Connecticut
## 4 1 1 23 Maine
## 5 1 1 25 Massachusetts
## 6 1 1 33 New Hampshire
str(region_info)
## 'data.frame': 64 obs. of 4 variables:
## $ Region : chr "1" "1" "1" "1" ...
## $ Division: chr "0" "1" "1" "1" ...
## $ FIPS : chr "00" "00" "09" "23" ...
## $ Name : chr "Northeast Region" "New England Division" "Connecticut" "Maine" ...

# Merge the two datasets on FIPS code. merge() keeps only the FIPS codes found
# in both tables (all = FALSE is its default), which is why the recorded
# result has 51 rows.
# NOTE(review): `income` has 52 id levels, so one row -- presumably Puerto
# Rico (FIPS 72) -- has no match in region_info and is dropped; confirm.
combined_dataset <- merge(income, region_info, by = "FIPS")
str(combined_dataset)
## 'data.frame': 51 obs. of 12 variables:
## $ FIPS : chr "01" "02" "04" "05" ...
## $ id : Factor w/ 52 levels "0400000US01",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ name : Factor w/ 52 levels "Alabama","Alaska",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ households : int 1838683 251899 2370289 1129723 12542460 1977591 1355849 335707 263649 7158980 ...
## $ households_moe: int 5863 1331 8769 4304 20542 4762 3506 1773 1588 25790 ...
## $ med_income : int 43253 70760 49774 40768 61094 58433 69461 59878 65830 46956 ...
## $ med_income_moe: int 241 732 253 330 157 314 411 714 1006 149 ...
## $ med_min : int 43012 70028 49521 40438 60937 58119 69050 59164 64824 46807 ...
## $ med_max : int 43494 71492 50027 41098 61251 58747 69872 60592 66836 47105 ...
## $ Region : chr "3" "4" "4" "3" ...
## $ Division : chr "6" "9" "8" "7" ...
## $ Name : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
# There are 4 unique regions, as expected.
unique(combined_dataset$Region)
## [1] "3" "4" "1" "2"

# A division code of 0 marks the rows that name the four regions themselves.
subset(region_info, Division == "0")
## Region Division FIPS Name
## 1 1 0 00 Northeast Region
## 13 2 0 00 Midwest Region
## 28 3 0 00 South Region
## 49 4 0 00 West Region
# Subset regions for graphing.

# Return the Name, med_income and Region columns of `data` for the rows whose
# Region equals `region_code`, sorted from highest to lowest median income.
# Replaces four copy-pasted subset()/order() pairs with one definition.
region_by_income <- function(data, region_code) {
  picked <- subset(
    data,
    Region == region_code,
    select = c(Name, med_income, Region)
  )
  picked[order(picked$med_income, decreasing = TRUE), ]
}

# Region codes as listed in region_info: 1 = Northeast, 2 = Midwest,
# 3 = South, 4 = West.
NE <- region_by_income(combined_dataset, "1")
MW <- region_by_income(combined_dataset, "2")
S <- region_by_income(combined_dataset, "3")
W <- region_by_income(combined_dataset, "4")
# Draw one horizontal bar chart per region in a 2 x 2 grid.
# mar = bottom, left, top, right; the wide left margin leaves room for the
# bar labels.
# par() returns the previous values of the settings it changes, so they are
# saved here and restored at the end. Resetting only mfrow would leave the
# 10-line left margin in force for every later plot.
old_par <- par(mfrow = c(2, 2), mar = c(5, 10, 4, 2))

# One entry per panel: the region's rows, the panel title and the bar colour.
region_panels <- list(
  list(rows = NE, title = "Northeast Region", fill = "blue"),
  list(rows = MW, title = "Midwest Region", fill = "gold"),
  list(rows = S, title = "South Region", fill = "green"),
  list(rows = W, title = "West Region", fill = "red")
)

for (panel in region_panels) {
  barplot(
    panel$rows$med_income,
    main = panel$title,
    col = panel$fill,
    horiz = TRUE,
    names.arg = panel$rows$Name,
    las = 2,
    # extend the axis 30% past the largest bar
    xlim = c(0, max(panel$rows$med_income * 1.3))
  )
  # turn on gridlines
  grid(NULL, NA, lty = "solid", lwd = 1, col = "black")
}

# Return to the graphics settings that were in force before this section
# (both the panel layout and the margins).
par(old_par)
# Switch to the line-chart tutorial's data folder.
getwd()
## [1] "/Users/hollyjones/Desktop/FlowingData"
setwd("/Users/hollyjones/Desktop/FlowingData/lines/data")
dir()
## [1] "country-regions.csv" "life-expectancy-cleaned.csv"

# Life expectancy in wide form: one row per country, one X<year> column per
# year. Text columns are kept as character rather than factor.
life <- read.csv(
  "life-expectancy-cleaned.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)
head(life)
## Country.Name Country.Code X1960 X1961 X1962 X1963 X1964
## 1 World WLD 50.76101 NA NA NA NA
## 2 Afghanistan AFG 31.43600 31.767 32.113 32.475 32.853
## 3 Albania ALB 61.30900 62.272 63.116 63.811 64.345
## 4 Algeria DZA 45.96100 46.493 47.043 47.613 48.203
## 5 Angola AGO 31.53900 31.935 32.331 32.726 33.121
## 6 Antigua and Barbuda ATG 60.50000 NA NA NA NA
## X1965 X1966 X1967 X1968 X1969 X1970 X1971 X1972 X1973
## 1 53.98691 NA NA NA NA 57.60103 NA NA NA
## 2 33.24500 33.646 34.055 34.466 34.878 35.29500 35.718 36.152 36.594
## 3 64.72700 64.983 65.164 65.314 65.457 65.60700 65.766 65.923 66.074
## 4 48.81000 49.428 50.050 50.673 51.294 51.91900 52.553 53.202 53.868
## 5 33.51600 33.912 34.310 34.709 35.108 35.50800 35.911 36.315 36.714
## 6 NA NA NA NA NA 65.00000 NA NA NA
## X1974 X1975 X1976 X1977 X1978 X1979 X1980 X1981 X1982
## 1 NA 59.39263 NA NA NA NA 60.88545 61.14777 61.42275
## 2 37.040 37.48200 37.910 38.318 38.700 39.056 39.38900 39.70500 40.01300
## 3 66.223 66.37900 66.546 66.728 66.925 67.140 67.37600 67.63900 67.92100
## 4 54.550 55.23800 55.919 56.585 57.236 57.877 58.53100 59.22900 59.98800
## 5 37.102 37.46900 37.805 38.102 38.357 38.568 38.73400 38.85900 38.95200
## 6 NA NA NA NA NA NA NA NA 70.37000
## X1983 X1984 X1985 X1986 X1987 X1988 X1989 X1990
## 1 61.63485 61.8717 62.13293 62.45411 62.7183 62.93638 63.15262 63.33469
## 2 40.31900 40.6260 40.93400 41.24000 41.5390 41.82800 42.10900 42.38400
## 3 68.21200 68.4960 68.74100 68.91200 68.9920 68.98400 68.90300 68.78300
## 4 60.80900 61.6760 62.55600 63.40600 64.1830 64.85700 65.41800 65.86300
## 5 39.02100 39.0750 39.11400 39.13300 39.1370 39.13400 39.14100 39.17800
## 6 NA NA NA NA 70.9400 NA NA NA
## X1991 X1992 X1993 X1994 X1995 X1996 X1997 X1998
## 1 63.51141 63.66851 63.75027 63.90795 64.08667 64.34908 64.60381 64.80676
## 2 42.65600 42.92900 43.20600 43.48900 43.77700 44.06900 44.36200 44.65500
## 3 68.67100 68.61400 68.64700 68.78700 69.04200 69.40400 69.83900 70.31000
## 4 66.20800 66.48700 66.73600 66.97300 67.22100 67.49400 67.79200 68.11300
## 5 39.26400 39.41600 39.64600 39.96500 40.38800 40.92600 41.56700 42.28700
## 6 NA 71.50000 NA NA NA NA 72.00000 NA
## X1999 X2000 X2001 X2002 X2003 X2004 X2005 X2006
## 1 64.98231 65.19298 65.4227 65.61474 65.83069 66.09486 66.30251 66.58488
## 2 44.94400 45.22700 45.4980 45.75900 46.01000 46.25900 46.51300 46.78300
## 3 70.79500 71.26800 71.7100 72.11000 72.46300 72.76400 73.01200 73.21500
## 4 68.45500 68.81100 69.1660 69.50600 69.81900 70.10100 70.35200 70.57700
## 5 43.06300 43.86200 44.6460 45.38900 46.06800 46.67200 47.19600 47.65300
## 6 NA NA NA 72.50000 NA NA NA NA
## X2007 X2008 X2009
## 1 66.85055 67.09668 67.36927
## 2 47.07800 47.40100 47.75300
## 3 73.39000 73.55100 73.70500
## 4 70.78700 70.99100 71.19300
## 5 48.06700 48.46400 48.85200
## 6 NA NA NA
# Reshape the data for easy plotting: the columns representing each year are
# turned into rows, keeping the country name and code as identifiers.
library(reshape)
life2 <- melt(life, id.vars = c("Country.Name", "Country.Code"))
head(life2)
## Country.Name Country.Code variable value
## 1 World WLD X1960 50.76101
## 2 Afghanistan AFG X1960 31.43600
## 3 Albania ALB X1960 61.30900
## 4 Algeria DZA X1960 45.96100
## 5 Angola AGO X1960 31.53900
## 6 Antigua and Barbuda ATG X1960 60.50000
# Change the column headings. "life-expectancy" is not a syntactic R name, so
# later code has to refer to it with backticks.
library(dplyr)
names(life2) <- c("country", "code", "year", "life-expectancy")

# Remove the "X" preceding each year by keeping everything from the second
# character onward.
life2[["year"]] <- substring(life2[["year"]], 2)
head(life2)
## country code year life-expectancy
## 1 World WLD 1960 50.76101
## 2 Afghanistan AFG 1960 31.43600
## 3 Albania ALB 1960 61.30900
## 4 Algeria DZA 1960 45.96100
## 5 Angola AGO 1960 31.53900
## 6 Antigua and Barbuda ATG 1960 60.50000
tail(life2)
## country code year life-expectancy
## 10245 Vietnam VNM 2009 72.689
## 10246 Virgin Islands (U.S.) VIR 2009 76.103
## 10247 West Bank and Gaza PSE 2009 70.875
## 10248 Yemen, Rep. YEM 2009 63.164
## 10249 Zambia ZMB 2009 47.416
## 10250 Zimbabwe ZWE 2009 49.370
# DEFAULT EXAMPLE IS INCORRECT:
# Handing the year and life-expectancy columns straight to plot() draws every
# row as a single series. Instead, use a for loop that checks each item in the
# chosen column against that column's unique values and plots one line per
# unique value (here: per country, as identified by country code).
plot(
  life2[, -(1:2)],
  type = "l",
  main = "Life Expectancy Across the Globe",
  xlab = "Year",
  ylab = "Life Expectancy"
)

# Start with a blank plot (type = "n" sets up the axes but draws no data).
plot(
  0, 0,
  type = "n",
  xlim = c(1960, 2010),
  ylim = c(25, 80),
  main = "Life Expectancy, 1960 to 2010",
  xlab = "",
  ylab = "Life Expectancy",
  las = 1,
  lwd = 2,
  bty = "n",
  cex.axis = 0.7
)
# Identify the unique code for each country.
# codes = COUNTRY CODES
codes <- unique(life2$code)

# Plot one translucent line per country on the current plot.
# seq_along() is used instead of 1:length(): with an empty `codes` the latter
# would iterate over c(1, 0) rather than skipping the loop.
for (i in seq_along(codes)) {
  currCountry <- subset(life2, life2$code == codes[i])
  # `year` is stored as text, so convert it before ordering and plotting.
  currCountry <- currCountry[order(as.numeric(currCountry$year)), ]
  # Use each row's own year as the x value. A hard-coded 1960:2009 only works
  # when every country has exactly 50 rows; otherwise lines() fails because
  # the x and y lengths differ.
  lines(
    as.numeric(currCountry$year),
    currCountry$`life-expectancy`,
    col = "#00200070"
  )
}
# subset to just sub-saharan african countries
library(sqldf)
# Country-to-region lookup table, with text columns kept as character.
countries <- read.csv(
  "country-regions.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)
head(countries)
## CountryCode RegionCode
## 1 AFG SAS
## 2 ALB ECS
## 3 DZA MEA
## 4 AGO SSF
## 5 ARG LCN
## 6 ARM ECS
## RegionName
## 1 South Asia
## 2 Europe & Central Asia (all income levels)
## 3 Middle East & North Africa (all income levels)
## 4 Sub-Saharan Africa (all income levels)
## 5 Latin America & Caribbean (all income levels)
## 6 Europe & Central Asia (all income levels)
# life3 is life2 with regional information: each row gains the RegionName of
# its country, matched on the country code.
life3 <- merge(
  life2,
  countries[, c("CountryCode", "RegionName")],
  by.x = "code",
  by.y = "CountryCode"
)
head(life3)
## code country year life-expectancy
## 1 ABW Aruba 1967 66.242
## 2 ABW Aruba 2000 71.304
## 3 ABW Aruba 1965 65.660
## 4 ABW Aruba 2004 71.718
## 5 ABW Aruba 2001 71.378
## 6 ABW Aruba 1984 70.660
## RegionName
## 1 Latin America & Caribbean (all income levels)
## 2 Latin America & Caribbean (all income levels)
## 3 Latin America & Caribbean (all income levels)
## 4 Latin America & Caribbean (all income levels)
## 5 Latin America & Caribbean (all income levels)
## 6 Latin America & Caribbean (all income levels)
# There are 7 unique regions.
regions <- unique(life3$RegionName)
regions
## [1] "Latin America & Caribbean (all income levels)"
## [2] "South Asia"
## [3] "Sub-Saharan Africa (all income levels)"
## [4] "Europe & Central Asia (all income levels)"
## [5] "Middle East & North Africa (all income levels)"
## [6] "East Asia & Pacific (all income levels)"
## [7] "North America"
# Keep only the rows whose region name starts with "Sub-Saharan".
subsah <- sqldf(
  "SELECT * FROM life3 WHERE RegionName LIKE 'Sub-Saharan%'"
)
head(subsah)
## code country year life-expectancy RegionName
## 1 AGO Angola 1993 39.646 Sub-Saharan Africa (all income levels)
## 2 AGO Angola 1973 36.714 Sub-Saharan Africa (all income levels)
## 3 AGO Angola 1987 39.137 Sub-Saharan Africa (all income levels)
## 4 AGO Angola 2004 46.672 Sub-Saharan Africa (all income levels)
## 5 AGO Angola 1994 39.965 Sub-Saharan Africa (all income levels)
## 6 AGO Angola 2005 47.196 Sub-Saharan Africa (all income levels)
dim(subsah)
## [1] 2400 5
#unique(subsah$RegionName)

# Create another blank plot (axes only) for the Sub-Saharan lines.
plot(
  0, 0,
  type = "n",
  xlim = c(1960, 2010),
  ylim = c(25, 80),
  main = "Life Expectancy in Sub-Saharan Africa \n 1960 to 2010",
  xlab = "",
  ylab = "Life Expectancy",
  las = 1,
  lwd = 2,
  bty = "n",
  cex.axis = 0.7
)

# Find the unique countries in the Sub-Saharan region.
subsah_codes <- unique(subsah$code)
subsah_codes
## [1] "AGO" "BDI" "BEN" "BFA" "BWA" "CAF" "CIV" "CMR" "COD" "COG" "COM"
## [12] "CPV" "ERI" "ETH" "GAB" "GHA" "GIN" "GMB" "GNB" "GNQ" "KEN" "LBR"
## [23] "LSO" "MDG" "MLI" "MOZ" "MRT" "MUS" "MWI" "MYT" "NAM" "NER" "NGA"
## [34] "RWA" "SDN" "SEN" "SLE" "SOM" "STP" "SWZ" "SYC" "TCD" "TGO" "TZA"
## [45] "UGA" "ZAF" "ZMB" "ZWE"
length(subsah_codes)
## [1] 48
# There are 48 unique countries in Sub-Saharan Africa.

# Another for loop to plot the lines, one per country code.
# subset(df, condition to subset on, optional select statement for columns to keep)
# seq_along() is used instead of 1:length(): with an empty `subsah_codes` the
# latter would iterate over c(1, 0) rather than skipping the loop.
for (j in seq_along(subsah_codes)) {
  current_country <- subset(subsah, subsah$code == subsah_codes[j])
  # `year` is stored as text, so convert it before ordering and plotting.
  current_country <- current_country[order(as.numeric(current_country$year)), ]
  # Use each row's own year as the x value. A hard-coded 1960:2009 only works
  # when every country has exactly 50 rows; otherwise lines() fails because
  # the x and y lengths differ.
  lines(
    as.numeric(current_country$year),
    current_country$`life-expectancy`,
    col = "#00200070"
  )
}
# Make the Rwanda line more obvious: redraw it in red, twice as thick.
current_country <- subset(subsah, code == "RWA")
current_country <- current_country[order(current_country$year), ]
lines(
  1960:2009,
  current_country$`life-expectancy`,
  col = "red",
  lwd = 2
)

# Annotate the lowest point of the Rwanda line: italic text just to the right
# of (pos = 4) and slightly below the minimum, plus a circle drawn at x = 1993.
text(
  1993 + 0.9,
  min(current_country$`life-expectancy`) - 0.4,
  "In 1993, Rwanda's average life\nexpectancy was 25 years.",
  pos = 4,
  font = 3,
  cex = .95
)
symbols(
  1993,
  min(current_country$`life-expectancy`),
  circles = 0.5,
  inches = FALSE,
  add = TRUE,
  lwd = 2
)
# Line type
# New blank plot for comparing line types.
plot(
  0, 0,
  type = "n",
  xlim = c(1960, 2009),
  ylim = c(23, 80),
  main = "Life Expectancy",
  xlab = "",
  ylab = "Years from birth",
  las = 1,
  lwd = 2,
  bty = "n",
  cex.axis = 0.8
)
# Rwanda: label at the left edge, line drawn dashed (lty = 2).
# current_country still holds the Rwanda rows at this point.
text(1960, 44, "Rwanda", pos = 4)
lines(
  1960:2009,
  current_country$`life-expectancy`,
  col = "#000000",
  lty = 2,
  lwd = 2
)
head(life3)
## code country year life-expectancy
## 1 ABW Aruba 1967 66.242
## 2 ABW Aruba 2000 71.304
## 3 ABW Aruba 1965 65.660
## 4 ABW Aruba 2004 71.718
## 5 ABW Aruba 2001 71.378
## 6 ABW Aruba 1984 70.660
## RegionName
## 1 Latin America & Caribbean (all income levels)
## 2 Latin America & Caribbean (all income levels)
## 3 Latin America & Caribbean (all income levels)
## 4 Latin America & Caribbean (all income levels)
## 5 Latin America & Caribbean (all income levels)
## 6 Latin America & Caribbean (all income levels)
# United States: pull its rows from life3, put them in year order, then add a
# label and a dotted line (lty = 3) to the current plot.
usa <- sqldf("SELECT * FROM life3 WHERE code LIKE '%USA%' ")
usa <- usa[order(usa$year), ]
text(1960, 69, "United States", pos = 4)
lines(
  1960:2009,
  usa$`life-expectancy`,
  col = "#000000",
  lty = 3,
  lwd = 2
)