Quiz 1

2006 Microdata Survey data about United States communities

https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06hid.csv

Code book describing the variable names

https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FPUMSDataDict06.pdf

# Work from the week-1 course folder so the relative "./data" paths resolve.
setwd("C:/Users/krith/OneDrive/Documents/coursera/getting_cleaning_data/week1")

# download.file() does not create missing directories, so make sure the
# target folder exists before writing into it.
if (!dir.exists("./data")) {
  dir.create("./data")
}

# Fetch the 2006 microdata survey file (ss06hid.csv) and list the data folder
# to confirm it landed on disk.
microdata_url <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06hid.csv"
download.file(microdata_url, destfile = "./data/microdata_survey.csv", method = "curl")
list.files("./data/")

## [1] "balt_restaurants.xml" "cameras.csv"          "cameras.xlsx"        
## [4] "microdata_survey.csv" "ngap.xlsx"            "simple.xml"

# Load the downloaded survey CSV into a data frame and report its dimensions.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
microdata_survey <- read.csv(
  "./data/microdata_survey.csv",
  sep = ",",
  header = TRUE
)
dim(microdata_survey)

## [1] 6496  188

# Structure of the VAL column.
str(microdata_survey[["VAL"]])

##  int [1:6496] 17 NA 18 19 20 15 NA NA 13 1 ...

# Leading values of VAL.
head(microdata_survey[["VAL"]])

## [1] 17 NA 18 19 20 15

# Trailing values of VAL.
tail(microdata_survey[["VAL"]])

## [1] NA NA 16  6 NA NA

# Per the code book, VAL code 24 marks properties worth $1,000,000 or more.
# Count them by summing the logical mask; na.rm = TRUE skips rows where VAL
# is missing, so no separate is.na() filter is needed.
props <- sum(microdata_survey$VAL >= 24, na.rm = TRUE)
sprintf("There are %i properties that are worth $1,000,000 or more", props)

## [1] "There are 53 properties that are worth $1,000,000 or more"

Excel spreadsheet on Natural Gas Acquisition Program

https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FDATA.gov_NGAP.xlsx

library(openxlsx)

## Warning: package 'openxlsx' was built under R version 3.3.3

# Download the NGAP workbook into the data folder, then list the folder to
# verify the file is there.
ng_url <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FDATA.gov_NGAP.xlsx"
download.file(
  url = ng_url,
  destfile = "./data/ngap.xlsx",
  method = "curl"
)
list.files(path = "./data/")

## [1] "balt_restaurants.xml" "cameras.csv"          "cameras.xlsx"        
## [4] "microdata_survey.csv" "ngap.xlsx"            "simple.xml"

# Read only rows 18-23 and columns 7-15 of the first sheet.
# The ranges are defined once and passed to read.xlsx(), so the variables
# and the call cannot drift apart.
rowIndex <- 18:23
colIndex <- 7:15
dat <- read.xlsx(
  "./data/ngap.xlsx",
  sheet = 1,
  cols = colIndex,
  rows = rowIndex
)
dim(dat)

## [1] 5 9

dat

##     Zip CuCurrent PaCurrent PoCurrent      Contact Ext          Fax email
## 1 74136         0         1         0 918-491-6998   0 918-491-6659    NA
## 2 30329         1         0         0 404-321-5711  NA         <NA>    NA
## 3 74136         1         0         0 918-523-2516   0 918-523-2522    NA
## 4 80203         0         1         0 303-864-1919   0         <NA>    NA
## 5 80120         1         0         0 345-098-8890 456         <NA>    NA
##   Status
## 1      1
## 2      1
## 3      1
## 4      1
## 5      1

# Total of Zip * Ext across the rows of `dat`; na.rm = TRUE drops rows whose
# product is NA. TRUE is spelled out because `T` can be reassigned.
sum(dat$Zip * dat$Ext, na.rm = TRUE)

## [1] 36534720

XML data on Baltimore restaurants

https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml

library(XML)

## Warning: package 'XML' was built under R version 3.3.3

# Download the Baltimore restaurants XML file into the data folder.
baltimore_url <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml"
download.file(baltimore_url, destfile = "./data/balt_restaurants.xml")

# Parse the file into an internal-node document. The argument name is written
# out in full (useInternalNodes) instead of relying on partial matching of
# `useInternal`.
balt <- xmlTreeParse("./data/balt_restaurants.xml", useInternalNodes = TRUE)

# Grab the document's root element and show its tag name.
rootNode <- xmlRoot(balt)
xmlName(rootNode)

## [1] "response"

# Inspect the child fields of the first record under the root.
xmlChildren(rootNode[[1]][[1]])

## $name
## <name>410</name> 
## 
## $zipcode
## <zipcode>21206</zipcode> 
## 
## $neighborhood
## <neighborhood>Frankford</neighborhood> 
## 
## $councildistrict
## <councildistrict>2</councildistrict> 
## 
## $policedistrict
## <policedistrict>NORTHEASTERN</policedistrict> 
## 
## $location_1
## <location_1 human_address="{&quot;address&quot;:&quot;4509 BELAIR ROAD&quot;,&quot;city&quot;:&quot;Baltimore&quot;,&quot;state&quot;:&quot;MD&quot;,&quot;zip&quot;:&quot;&quot;}" needs_recoding="true"/> 
## 
## attr(,"class")
## [1] "XMLInternalNodeList" "XMLNodeList"

# Extract the text of every <zipcode> node, then count the entries equal to
# 21231. The comparison uses a string because xmlValue yields text, and
# sum(..., na.rm = TRUE) counts matches without letting a missing value be
# counted as one.
zipcode <- xpathSApply(rootNode, "//zipcode", xmlValue)
sum(zipcode == "21231", na.rm = TRUE)

## [1] 127

2006 Microdata Survey data about United States communities

https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06pid.csv

library(data.table)
# Download ss06pid.csv and load it as a data.table. It is saved under its own
# file name so it does not overwrite "./data/microdata_survey.csv", which
# holds the ss06hid.csv download used earlier in this script.
fileUrl <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06pid.csv"
download.file(fileUrl, destfile = "./data/microdata_survey_pid.csv")
DT <- fread("./data/microdata_survey_pid.csv")

# Candidate expressions involving the mean of pwgtp15 and SEX, each timed
# with a single system.time() call. The expressions are kept exactly as
# written because they are the thing being measured.

# data.table grouping: mean of pwgtp15 computed per SEX group.
system.time(DT[,mean(pwgtp15),by=SEX])

##    user  system elapsed 
##       0       0       0

# Two separate subset-then-mean calls (SEX == 1 and SEX == 2); their timings
# are added together.
system.time(mean(DT[DT$SEX==1,]$pwgtp15))+system.time(mean(DT[DT$SEX==2,]$pwgtp15))

##    user  system elapsed 
##    0.00    0.02    0.01

# Split pwgtp15 by SEX, then apply mean to each piece.
system.time(sapply(split(DT$pwgtp15,DT$SEX),mean))

##    user  system elapsed 
##       0       0       0

# NOTE(review): mean() has no `by` parameter, so this appears to time the
# overall mean of pwgtp15 rather than a per-SEX mean -- confirm before
# treating it as equivalent to the other candidates.
system.time(mean(DT$pwgtp15,by=DT$SEX))

##    user  system elapsed 
##       0       0       0

# tapply: mean of pwgtp15 within each SEX level.
system.time(tapply(DT$pwgtp15,DT$SEX,mean))

##    user  system elapsed 
##       0       0       0

# Disabled candidate, left commented out.
# NOTE(review): rowMeans(DT) averages across the columns of each row, so this
# would not yield the pwgtp15 mean by SEX -- presumably why it is disabled.
#system.time(rowMeans(DT)[DT$SEX==1])+system.time(rowMeans(DT)[DT$SEX==2])