author: Jeanne Spicer
date: January 2014 (PSU R User Group Mtg)
Options
Why plain text formats?
This downloads and reads a delimited file into R
read.table("http://www.ats.ucla.edu/stat/r/faq/test.txt")
## V1 V2 V3 V4 V5
## 1 make model mpg weight price
## 2 AMC Concord 22 2930 4099
## 3 AMC Pacer 17 3350 4749
## 4 AMC Spirit 22 2640 3799
## 5 Buick Century 20 3250 4816
## 6 Buick Electra 15 4080 7827
Use the XML package…
library(XML)
wikiURL1 <- "http://en.wikipedia.org/wiki/States_and_union_territories_of_India"
wikiURL2 <- "http://en.wikipedia.org/wiki/List_of_states_and_territories_of_India_by_area"
# HTML pages display numbers in ways that are not numeric But R will try to
# factor them
options(stringsAsFactors = FALSE)
tablesfromURL1 <- readHTMLTable(wikiURL1)
tablesfromURL2 <- readHTMLTable(wikiURL2)
# Display what was extracted str(tablesfromURL1)
Double square brackets
statesRaw <- tablesfromURL1[["States of India"]]
samestates <- tablesfromURL1[[3]] # Using list index
utRaw <- tablesfromURL1[["Union Territories"]]
regions <- tablesfromURL2[[2]]
Easy renaming
library(plyr)
# rename
regions <- rename(regions, c(State = "Name", `Area (km²)[4]` = "AreaKm"))
statesRaw <- rename(statesRaw, c(`Literacy Rate(%)` = "Literacy", `Percentage of Urban Population to total Population` = "pcturban"))
An rbind that is not fussy (but watch out)
allstates <- rbind.fill(statesRaw, utRaw)
names(allstates)
## [1] "Map"
## [2] "Name"
## [3] "ISO 3166-2 code[2]"
## [4] "Population"
## [5] "Area\n(km2)"
## [6] "Official\nlanguage(s)"
## [7] "Capital"
## [8] "Largest city(if not capital)"
## [9] "Population density"
## [10] "Literacy"
## [11] "pcturban"
## [12] "Sex Ratio"
## [13] "Official\nlanguage"
## [14] "Literacy Rate(%)"
## [15] "Percentage of Urban Population to total Population"
utRaw <- rename(utRaw, c(`Literacy Rate(%)` = "Literacy", `Percentage of Urban Population to total Population` = "pcturban"))
allstates <- rbind.fill(statesRaw, utRaw)
india <- join(allstates, regions, by = "Name", type = "left")
head(india)
## Map Name ISO 3166-2 code[2] Population Area\n(km2)
## 1 1 Andhra Pradesh AP 84,665,533 275,045
## 2 2 Arunachal Pradesh AR 1,382,611 83,743
## 3 3 Assam AS 31,169,272 78,550
## 4 4 Bihar BR 103,804,637 99,200
## 5 5 Chhattisgarh CT 25,540,196 135,194
## 6 6 Goa GA 1,457,723 3,702
## Official\nlanguage(s) Capital Largest city(if not capital)
## 1 Telugu, Urdu, English Hyderabad
## 2 English Itanagar
## 3 Assamese; Regional: Bodo, Bengali Dispur Guwahati
## 4 Hindi, Magadhi, Maithili, Urdu Patna
## 5 Chattisgarhi, Hindi Raipur
## 6 Konkani Panjim
## Population density Literacy pcturban Sex Ratio Official\nlanguage Rank
## 1 308 67.66 27.3 992 <NA> 4
## 2 17 66.95 20.8 920 <NA> 14
## 3 397 73.18 12.9 954 <NA> 16
## 4 1,102 63.82 10.5 916[3] <NA> 12
## 5 189 71.04 20.1 991 <NA> 10
## 6 394 87.40 62.2 968 <NA> 29
## AreaKm Region Comparable country
## 1 [note 2] 275,045 Southern New Zealand
## 2 83,743 Eastern United Arab Emirates
## 3 78,438 Eastern Ireland
## 4 94,163 East India South Korea
## 5 [note 1] 135,191 Central India Greece
## 6 3,702 Western Luxembourg
datasource <- paste("This file was pulled from Wikipedia page http://en.wikipedia.org/wiki/States_and_union_territories_of_India on ",
Sys.Date())
comment(statesRaw) <- datasource
# your annotation stays with dataframe
str(statesRaw)
## 'data.frame': 28 obs. of 12 variables:
## $ Map : chr "1" "2" "3" "4" ...
## $ Name : chr "Andhra Pradesh" "Arunachal Pradesh" "Assam" "Bihar" ...
## $ ISO 3166-2 code[2] : chr "AP" "AR" "AS" "BR" ...
## $ Population : chr "84,665,533" "1,382,611" "31,169,272" "103,804,637" ...
## $ Area
## (km2) : chr "275,045" "83,743" "78,550" "99,200" ...
## $ Official
## language(s) : chr "Telugu, Urdu, English" "English" "Assamese; Regional: Bodo, Bengali" "Hindi, Magadhi, Maithili, Urdu" ...
## $ Capital : chr "Hyderabad" "Itanagar" "Dispur" "Patna" ...
## $ Largest city(if not capital): chr "" "" "Guwahati" "" ...
## $ Population density : chr "308" "17" "397" "1,102" ...
## $ Literacy : chr "67.66" "66.95" "73.18" "63.82" ...
## $ pcturban : chr "27.3" "20.8" "12.9" "10.5" ...
## $ Sex Ratio : chr "992" "920" "954" "916[3]" ...
## - attr(*, "comment")= chr "This file was pulled from Wikipedia page http://en.wikipedia.org/wiki/States_and_union_territories_of_India on 2014-01-23"
save(statesRaw, file = "statesRaw.rda")
# If you need to restore the file, uncommment the line below
# load(file='statesRaw.rda')
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:plyr':
##
## arrange, desc, failwith, id, mutate, summarise
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
sessionInfo()
## R version 3.0.2 (2013-09-25)
## Platform: x86_64-redhat-linux-gnu (64-bit)
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] dplyr_0.1 plyr_1.8 XML_3.98-1.1 knitr_1.5
##
## loaded via a namespace (and not attached):
## [1] assertthat_0.1 evaluate_0.5.1 formatR_0.10 Rcpp_0.10.6
## [5] stringr_0.6.2 tools_3.0.2
search()
## [1] ".GlobalEnv" "package:dplyr" "package:plyr"
## [4] "package:XML" "package:knitr" "package:stats"
## [7] "package:graphics" "package:grDevices" "package:utils"
## [10] "package:datasets" "package:methods" "Autoloads"
## [13] "package:base"
`?`(summarise)
## Help on topic 'summarise' was found in the following packages:
##
## Package Library
## dplyr /nfs/users/xn8/R/x86_64-redhat-linux-gnu-library/3.0
## plyr /usr/lib64/R/library
##
##
## Using the first match ...
Commands work on dataframes, table_dfs or grouped_dfs
Streamlined syntax for joins
india_df <- left_join(allstates, regions, by = "Name")
Anti join handy for identifying what did not match
anti_join(allstates, regions, by = "Name")
## Map Name ISO 3166-2 code[2] Population
## 1 G Puducherry (Pondicherry) PY 1,244,464
## 2 F National Capital Territory of Delhi DL 16,753,235
## 3 20 Odisha [5] (Orissa) OR 41,947,358
## Area\n(km2) Official\nlanguage(s) Capital
## 1 <NA> <NA> Pondicherry
## 2 <NA> <NA> Delhi
## 3 155,820 Oriya Bhubaneswar
## Largest city(if not capital) Population density Literacy pcturban
## 1 <NA> 2,598 86.55 66.6
## 2 <NA> 11,297 86.34 93.2
## 3 269 73.45 15.0
## Sex Ratio Official\nlanguage
## 1 1,038 French and Tamil; Regional: Malayalam, Telugu
## 2 866 English, Hindi, Punjabi, Urdu
## 3 978 <NA>
sortRegions <- arrange(regions, Region, desc(Rank))
You can select by name and ranges of variables with :
head(select(statesRaw, c(Map, Capital:Literacy)))
## Map Capital Largest city(if not capital) Population density Literacy
## 1 1 Hyderabad 308 67.66
## 2 2 Itanagar 17 66.95
## 3 3 Dispur Guwahati 397 73.18
## 4 4 Patna 1,102 63.82
## 5 5 Raipur 189 71.04
## 6 6 Panjim 394 87.40
filter(regions, Region == "")
## Rank Name AreaKm Region Comparable country
## 1 India[note 1][note 2] 3,287,263
# Recoding
india <- mutate(india, Literacy = type.convert(Literacy, dec = "."))
# Adding new variables
india <- mutate(india, mean(Literacy), Type = ifelse(pcturban > 80, "Urban",
"Rural"))
summarise(india, mean(Literacy, na.rm = T))
## mean(Literacy, na.rm = T)
## 1 78.65
summarise(group_by(india, Region), mean(Literacy, na.rm = T))
## Source: local data frame [8 x 2]
##
## Region mean(Literacy, na.rm = T)
## 1 NA 82.11
## 2 Northern 72.71
## 3 North India 77.21
## 4 Western 82.87
## 5 Central India 70.84
## 6 East India 65.72
## 7 Eastern 79.35
## 8 Southern 82.67
indiaSummary <- summarise(group_by(india, Region), mean(Literacy, na.rm = T))
indiaMutant <- mutate(group_by(india, Region), mean(Literacy, na.rm = T))
This is a new dplyr class “grouped_df”
class(indiaMutant)
## [1] "grouped_df" "tbl_df" "tbl" "data.frame"
indiaMutant
## Source: local data frame [35 x 20]
## Groups: Region
##
## Map Name ISO 3166-2 code[2] Population
## 1 1 Andhra Pradesh AP 84,665,533
## 2 2 Arunachal Pradesh AR 1,382,611
## 3 3 Assam AS 31,169,272
## 4 4 Bihar BR 103,804,637
## 5 5 Chhattisgarh CT 25,540,196
## 6 6 Goa GA 1,457,723
## 7 7 Gujarat GJ 60,383,628
## 8 8 Haryana HR 25,353,081
## 9 9 Himachal Pradesh HP 6,856,509
## 10 10 Jammu and Kashmir JK 12,548,926
## 11 11 Jharkhand JH 32,966,238
## 12 12 Karnataka KA 61,130,704
## 13 13 Kerala KL 33,387,677
## 14 14 Madhya Pradesh MP 72,597,565
## 15 15 Maharashtra MH 112,372,972
## 16 16 Manipur MN 2,721,756
## 17 17 Meghalaya ML 2,964,007
## 18 18 Mizoram MZ 1,091,014
## 19 19 Nagaland NL 1,980,602
## 20 20 Odisha [5] (Orissa) OR 41,947,358
## 21 21 Punjab PB 27,704,236
## 22 22 Rajasthan RJ 68,621,012
## 23 23 Sikkim SK 607,688
## 24 24 Tamil Nadu TN 72,138,958
## 25 25 Tripura TR 3,671,032
## 26 26 Uttar Pradesh UP 199,581,477
## 27 27 Uttarakhand UT 10,116,752
## 28 28 West Bengal WB 91,347,736
## 29 A Andaman and Nicobar Islands AN 379,944
## 30 B Chandigarh CH 1,054,686
## 31 C Dadra and Nagar Haveli DN 342,853
## 32 D Daman and Diu DD 242,911
## 33 E Lakshadweep LD 64,429
## 34 F National Capital Territory of Delhi DL 16,753,235
## 35 G Puducherry (Pondicherry) PY 1,244,464
## Variables not shown: Area (km2) (chr), Official language(s) (chr), Capital
## (chr), Largest city(if not capital) (chr), Population density (chr),
## Literacy (dbl), pcturban (chr), Sex Ratio (chr), Official language
## (chr), Rank (chr), AreaKm (chr), Region (chr), Comparable country (chr),
## mean(Literacy) (dbl), Type (chr), mean(Literacy, na.rm = T) (dbl)
methods(class = "grouped_df")
## [1] as.data.frame.grouped_df* do.grouped_df*
## [3] group_size.grouped_df* groups.grouped_df*
## [5] print.grouped_df* select.grouped_df*
## [7] ungroup.grouped_df*
##
## Non-visible functions are asterisked
Remembers the grouping of your table so that subsequent commands acknowledge the grouping.
summarise(indiaMutant, max(Literacy, na.rm = T))
## Source: local data frame [8 x 2]
##
## Region max(Literacy, na.rm = T)
## 1 NA 86.55
## 2 Northern 76.68
## 3 North India 86.43
## 4 Western 87.40
## 5 Central India 71.04
## 6 East India 67.63
## 7 Eastern 91.58
## 8 Southern 93.91
indiaGrouped <- group_by(india, Region)
summarise(indiaGrouped, mean(Literacy, na.rm = T))
## Source: local data frame [8 x 2]
##
## Region mean(Literacy, na.rm = T)
## 1 NA 82.11
## 2 Northern 72.71
## 3 North India 77.21
## 4 Western 82.87
## 5 Central India 70.84
## 6 East India 65.72
## 7 Eastern 79.35
## 8 Southern 82.67
You can run all this stuff together, results are passed to the next function in sequence.
library(stringr) # Another great package by Hadley Wickham!
stateSubset <- chain(india_df, select(Map:Population, Capital, Region), mutate(Pop = as.numeric(str_replace_all(Population,
",", ""))), arrange(desc(Pop)), filter(Pop > 1e+07), group_by(Region))
groupCts <- summarise(stateSubset, count = n(), avepop = mean(Pop, na.rm = TRUE),
tot = sum(Pop, na.rm = TRUE))
groupCts.attached <- mutate(stateSubset, count = n(), avepop = mean(Pop, na.rm = TRUE),
tot = sum(Pop, na.rm = TRUE))
Reading Data into R:
http://www.ats.ucla.edu/stat/r/faq/inputdata_R.htm
plyr:
http://www.hselab.org/machinery/content/getting-started-r-plyr-and-ggplot2-group-analysis
plyr has parallel computing capabilities, so it is still worth learning the syntax
dplyr:
http://cran.rstudio.com/web/packages/dplyr/vignettes/introduction.html