Getting & Cleaning Data

Question 1

The American Community Survey distributes downloadable data about United States communities. Download the 2006 microdata survey about housing for the state of Idaho using download.file() from here:

https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06hid.csv

and load the data into R. The code book, describing the variable names is here:

https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FPUMSDataDict06.pdf

Create a logical vector that identifies the households on greater than 10 acres who sold more than $10,000 worth of agriculture products. Assign that logical vector to the variable agricultureLogical. Apply the which() function like this to identify the rows of the data frame where the logical vector is TRUE.

which(agricultureLogical)

What are the first 3 values that result?

Solution

# Create the local data directory only when it is missing. The check has to
# target the same path that gets created ("./data"); testing ".data" never
# matched, so dir.create() ran on every execution and warned that the
# directory already exists.
if (!dir.exists("./data")) {
  dir.create("./data")
}

## Warning in dir.create("./data"): '.\data' already exists

# Fetch the 2006 Idaho housing microdata into ./data and load it.
fileUrl <- "http://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06hid.csv"
download.file(url = fileUrl, destfile = "./data/communities.csv")
communityData <- read.csv(file = "./data/communities.csv")

# Households flagged by both conditions. ACR == 3 and AGS == 6 are presumably
# the code-book codes for "more than 10 acres" and "more than $10,000 of
# agricultural sales" -- confirm against the PUMS data dictionary.
agricultureLogical <- with(communityData, ACR == 3 & AGS == 6)

# Row numbers of the first three matching households.
head(x = which(agricultureLogical), n = 3)

## [1] 125 238 262

Question 2

Using the jpeg package read in the following picture of your instructor into R

https://d396qusza40orc.cloudfront.net/getdata%2Fjeff.jpg

Use the parameter native=TRUE. What are the 30th and 80th quantiles of the resulting data? (some Linux systems may produce an answer 638 different for the 30th quantile)

# install.packages("jpeg")
library(jpeg)

# Download the picture into the working directory in binary mode ("wb").
jpeg_url <- "https://d396qusza40orc.cloudfront.net/getdata%2Fjeff.jpg"
download.file(jpeg_url, paste0(getwd(), "/jeff.jpg"), mode = "wb")

# Read the image in native representation, then report its 30% and 80%
# quantiles.
photo <- readJPEG(source = "jeff.jpg", native = TRUE)
quantile(photo, probs = c(0.3, 0.8), na.rm = TRUE)

##       30%       80% 
## -15259150 -10575416

Question 3

Load the Gross Domestic Product data for the 190 ranked countries in this data set:

https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FGDP.csv

Load the educational data from this data set:

https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FEDSTATS_Country.csv

Match the data based on the country shortcode. How many of the IDs match? Sort the data frame in descending order by GDP rank (so United States is last). What is the 13th country in the resulting data frame?

Original data sources:

http://data.worldbank.org/data-catalog/GDP-ranking-table

http://data.worldbank.org/data-catalog/ed-stats

library(plyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Download the GDP ranking and the education statistics into the working
# directory.
url <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FGDP.csv"
url2 <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FEDSTATS_Country.csv"
download.file(url, paste0(getwd(), "/GDP.csv"), mode = "wb")
download.file(url2, paste0(getwd(), "/EDU.csv"), mode = "wb")

EDU <- read.csv("EDU.csv", stringsAsFactors = FALSE)

# Skip the first four lines of the GDP file and read only columns 1, 2, 4
# and 5; a "NULL" entry in colClasses drops that column.
gdp_col_classes <- c(
  NA, NA, "NULL", NA, NA,
  "NULL", "NULL", "NULL", "NULL", "NULL"
)
GDP <- read.csv(
  "GDP.csv",
  skip = 4,
  colClasses = gdp_col_classes,
  na.strings = NULL,
  stringsAsFactors = FALSE
)
colnames(GDP) <- c("CountryCode", "Rank", "CountryName", "GDP")

# Rank is converted to numeric; entries that are not numbers become NA
# (with a coercion warning) and are filtered out in the next step.
GDP$Rank <- as.numeric(GDP$Rank)

## Warning: NAs introduced by coercion

# Keep only the rows that carry a numeric rank, then join them to the
# education data on the country short code and report the merged size.
GDP1 <- filter(GDP, !is.na(Rank))
mergedata <- merge(x = GDP1, y = EDU, by = "CountryCode")
dim(mergedata)

## [1] 189  34

# Sort by descending rank and pick the third column of the 13th row.
ranked_desc <- mergedata %>% arrange(desc(Rank))
ranked_desc[13, 3]

## [1] "St. Kitts and Nevis"

Question 4

What is the average GDP ranking for the “High income: OECD” and “High income: nonOECD” group?

# Mean GDP rank per income group, computed with tapply
with(mergedata, tapply(X = Rank, INDEX = Income.Group, FUN = mean))

## High income: nonOECD    High income: OECD           Low income 
##             91.91304             32.96667            133.72973 
##  Lower middle income  Upper middle income 
##            107.70370             92.13333

# Mean GDP rank per income group, computed with split + lapply
rank_by_income <- split(x = mergedata$Rank, f = mergedata$Income.Group)
lapply(X = rank_by_income, FUN = mean)

## $`High income: nonOECD`
## [1] 91.91304
## 
## $`High income: OECD`
## [1] 32.96667
## 
## $`Low income`
## [1] 133.7297
## 
## $`Lower middle income`
## [1] 107.7037
## 
## $`Upper middle income`
## [1] 92.13333

#Using dplyr
# Select the two high-income groups row-wise before grouping, instead of
# relying on a per-group scalar `%in%` test. summarise() is namespace-qualified
# because Hmisc (attached further down) masks summarize(), which would break
# this chunk when it is re-run in the same session.
mergedata %>%
  filter(Income.Group %in% c("High income: OECD", "High income: nonOECD")) %>%
  group_by(Income.Group) %>%
  dplyr::summarise(Average = mean(Rank, na.rm = TRUE))

## # A tibble: 2 x 2
##   Income.Group         Average
##   <chr>                  <dbl>
## 1 High income: nonOECD    91.9
## 2 High income: OECD       33.0

Question 5

Cut the GDP ranking into 5 separate quantile groups. Make a table versus Income.Group. How many countries are Lower middle income but among the 38 nations with highest GDP?

#Using Hmisc
#install.packages("Hmisc")
library(Hmisc)

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## Loading required package: ggplot2

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:plyr':
## 
##     is.discrete, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

# Split the ranks into five quantile groups with Hmisc::cut2 and
# cross-tabulate the groups against the income group.
GDP_group <- cut2(x = mergedata$Rank, g = 5)
table(GDP_group, mergedata$Income.Group)

##            
## GDP_group   High income: nonOECD High income: OECD Low income
##   [  1, 39)                    4                18          0
##   [ 39, 77)                    5                10          1
##   [ 77,115)                    8                 1          9
##   [115,154)                    5                 1         16
##   [154,190]                    1                 0         11
##            
## GDP_group   Lower middle income Upper middle income
##   [  1, 39)                   5                  11
##   [ 39, 77)                  13                   9
##   [ 77,115)                  12                   8
##   [115,154)                   8                   8
##   [154,190]                  16                   9

#Using dplyr
# mergedata is a plain data.frame (built with merge()), so data.table syntax
# such as mergedata[, Rank] or .N does not work on it, and the income column
# read by read.csv() is named Income.Group, not `Income Group`.
breaks <- quantile(mergedata$Rank, probs = seq(0, 1, 0.2), na.rm = TRUE)

# include.lowest = TRUE keeps the smallest rank inside the first interval;
# without it cut() would turn that row into NA.
mergedata$quantileGDP <- cut(
  mergedata$Rank,
  breaks = breaks,
  include.lowest = TRUE
)

# Number of "Lower middle income" countries in each rank quantile.
mergedata %>%
  filter(Income.Group == "Lower middle income") %>%
  dplyr::count(Income.Group, quantileGDP)

Question 6

The American Community Survey distributes downloadable data about United States communities. Download the 2006 microdata survey about housing for the state of Idaho using download.file() from here:

https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06hid.csv

and load the data into R. The code book, describing the variable names is here:

https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FPUMSDataDict06.pdf

Apply strsplit() to split all the names of the data frame on the characters “wgtp”. What is the value of the 123 element of the resulting list?

# Download the Idaho housing microdata and load it.
url <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06hid.csv"
download.file(url, paste0(getwd(), "/community.csv"), mode = "wb")
com <- read.csv("community.csv")

# Split every column name on the literal text "wgtp". The previous pattern
# "\\wgtp" was a regular expression meaning "any word character followed by
# gtp", which only matched the intended names by coincidence; fixed = TRUE
# makes the split a plain substring match.
splitlist <- strsplit(names(com), "wgtp", fixed = TRUE)
splitlist[[123]]

## [1] ""   "15"

Question 7

Load the Gross Domestic Product data for the 190 ranked countries in this data set:

https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FGDP.csv

Remove the commas from the GDP numbers in millions of dollars and average them. What is the average?

Original data sources:

http://data.worldbank.org/data-catalog/GDP-ranking-table

# Download the GDP ranking table and read the 190 ranked rows that follow
# the four leading lines.
url <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FGDP.csv"
download.file(url, paste0(getwd(), "/worldbank.csv"), mode = "wb")
worldbank <- read.csv(
  "worldbank.csv",
  skip = 4,
  nrows = 190,
  stringsAsFactors = FALSE
)

# Keep exactly the four columns that get names (the 1st, 2nd, 4th and 5th of
# the raw file). Previously only the third column was dropped and four names
# were assigned to the remaining columns, leaving every later column named NA.
worldbank <- worldbank[, c(1, 2, 4, 5)]
colnames(worldbank) <- c("CountryCode", "Rank", "CountryName", "GDP")

# Strip the thousands separators before converting the GDP figures to
# numbers, then average them.
worldbank1 <- as.numeric(gsub(",", "", worldbank$GDP, fixed = TRUE))
mean(worldbank1, na.rm = TRUE)

## [1] 377652.4

Question 8

In the data set from Question 7 what is a regular expression that would allow you to count the number of countries whose name begins with “United”? Assume that the variable with the country names in it is named countryNames. How many countries begin with United?

# Flag the country names that start with "United" and tabulate the flags.
isUnited <- grepl(pattern = "^United", x = worldbank$CountryName)
summary(object = isUnited)

##    Mode   FALSE    TRUE 
## logical     187       3

Question 9

Load the Gross Domestic Product data for the 190 ranked countries in this data set:

https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FGDP.csv

Load the educational data from this data set:

https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FEDSTATS_Country.csv

Match the data based on the country shortcode. Of the countries for which the end of the fiscal year is available, how many end in June?

Original data sources:

http://data.worldbank.org/data-catalog/GDP-ranking-table

http://data.worldbank.org/data-catalog/ed-stats

# Download the GDP ranking and the education statistics into the working
# directory.
url <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FGDP.csv"
url2 <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FEDSTATS_Country.csv"
download.file(url, paste0(getwd(), "/GDP.csv"), mode = "wb")
download.file(url2, paste0(getwd(), "/EDU.csv"), mode = "wb")

EDU <- read.csv("EDU.csv", stringsAsFactors = FALSE)

# Skip the first four lines of the GDP file and read only columns 1, 2, 4
# and 5; a "NULL" entry in colClasses drops that column.
gdp_col_classes <- c(
  NA, NA, "NULL", NA, NA,
  "NULL", "NULL", "NULL", "NULL", "NULL"
)
GDP <- read.csv(
  "GDP.csv",
  skip = 4,
  colClasses = gdp_col_classes,
  na.strings = NULL,
  stringsAsFactors = FALSE
)
colnames(GDP) <- c("CountryCode", "Rank", "CountryName", "GDP")

# Full outer join on the country short code, then count the rows whose
# Special.Notes mention a fiscal year ending on June 30.
mdf <- merge(x = GDP, y = EDU, by = "CountryCode", all = TRUE)
june_rows <- grep("[Ff]iscal [Yy]ear(.*)[Jj]une 30", mdf$Special.Notes)
length(june_rows)

## [1] 13

Question 10

You can use the quantmod (http://www.quantmod.com/) package to get historical stock prices for publicly traded companies on the NASDAQ and NYSE. Use the following code to download data on Amazon’s stock price and get the times the data was sampled.

#install.packages("quantmod")
library(quantmod)

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## Loading required package: TTR

## Version 0.4-0 included new data defaults. See ?getSymbols.

## 
## Attaching package: 'quantmod'

## The following object is masked from 'package:Hmisc':
## 
##     Lag

# Fetch the AMZN price history and return it as a value instead of letting
# getSymbols() assign it into an environment.
amzn <- getSymbols("AMZN", auto.assign = FALSE)

## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
## 
## This message is shown once per session and may be disabled by setting 
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.

# Render each sample time as "<year> <weekday name>" and count the samples
# that fall in 2012.
sampleTimes <- index(amzn)
date <- format(sampleTimes, "%Y %A")
sum(grepl("2012", date))

## [1] 250

# Count the 2012 samples whose formatted label names a Monday.
sum(grepl("2012.Monday", date))

## [1] 47

Getting & Cleaning Data - Quiz 3-4

Question 1

Solution

Question 2

Question 3

Question 4

Question 5

Question 6

Question 7

Question 8

Question 9

Question 10