# Data: https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06hid.csv
# Variable names (data dictionary):
# https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FPUMSDataDict06.pdf
# Housing microdata: download the survey CSV, load it, and count the
# properties whose VAL code marks them as worth $1,000,000 or more.

# NOTE(review): machine-specific absolute path -- the script only runs as-is
# where this directory exists; consider running it from the project root.
setwd("C:/Users/krith/OneDrive/Documents/coursera/getting_cleaning_data/week1")

# download.file() does not create the destination directory, so make sure
# ./data exists before the first download.
if (!dir.exists("./data")) {
  dir.create("./data")
}

microdata_url <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06hid.csv"
download.file(
  microdata_url,
  destfile = "./data/microdata_survey.csv",
  method = "curl"
)
list.files("./data/")
## [1] "balt_restaurants.xml" "cameras.csv" "cameras.xlsx"
## [4] "microdata_survey.csv" "ngap.xlsx" "simple.xml"
microdata_survey <- read.csv(
  "./data/microdata_survey.csv",
  sep = ",",
  header = TRUE
)
dim(microdata_survey)
## [1] 6496 188
str(microdata_survey$VAL)
## int [1:6496] 17 NA 18 19 20 15 NA NA 13 1 ...
head(microdata_survey$VAL)
## [1] 17 NA 18 19 20 15
tail(microdata_survey$VAL)
## [1] NA NA 16 6 NA NA
# The code book indicates that VAL code 24 marks properties worth
# $1,000,000 or more. Summing the logical mask counts the matching rows;
# na.rm = TRUE skips rows where VAL is missing.
props <- sum(microdata_survey$VAL > 23, na.rm = TRUE)
sprintf("There are %i properties that are worth $1,000,000 or more", props)
## [1] "There are 53 properties that are worth $1,000,000 or more"
# Data: https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FDATA.gov_NGAP.xlsx
# Excel subset: download the NGAP workbook, read a fixed cell range from
# the first sheet, and compute sum(Zip * Ext) over the rows read.
library(openxlsx)
## Warning: package 'openxlsx' was built under R version 3.3.3
ng_url <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2FDATA.gov_NGAP.xlsx"
download.file(ng_url, destfile = "./data/ngap.xlsx", method = "curl")
list.files("./data/")
## [1] "balt_restaurants.xml" "cameras.csv" "cameras.xlsx"
## [4] "microdata_survey.csv" "ngap.xlsx" "simple.xml"
# Reading only rows 18-23 and columns 7-15. The ranges are defined once
# here and passed to read.xlsx() so the two cannot drift apart.
rowIndex <- 18:23
colIndex <- 7:15
dat <- read.xlsx(
  "./data/ngap.xlsx",
  sheet = 1,
  cols = colIndex,
  rows = rowIndex
)
dim(dat)
## [1] 5 9
dat
## Zip CuCurrent PaCurrent PoCurrent Contact Ext Fax email
## 1 74136 0 1 0 918-491-6998 0 918-491-6659 NA
## 2 30329 1 0 0 404-321-5711 NA <NA> NA
## 3 74136 1 0 0 918-523-2516 0 918-523-2522 NA
## 4 80203 0 1 0 303-864-1919 0 <NA> NA
## 5 80120 1 0 0 345-098-8890 456 <NA> NA
## Status
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
# Ext is NA for some rows (see the printout above), so drop NA products
# from the total.
sum(dat$Zip * dat$Ext, na.rm = TRUE)
## [1] 36534720
# Data: https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml
# Baltimore restaurants XML: download the file, parse it into an internal
# node tree, and count the <zipcode> elements equal to 21231.
library(XML)
## Warning: package 'XML' was built under R version 3.3.3
baltimore_url <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml"
download.file(baltimore_url, destfile = "./data/balt_restaurants.xml")
# Spell out useInternalNodes rather than relying on partial argument
# matching; internal nodes are what the XPath query below operates on.
balt <- xmlTreeParse("./data/balt_restaurants.xml", useInternalNodes = TRUE)
rootNode <- xmlRoot(balt)
xmlName(rootNode)
## [1] "response"
# Inspect the fields of the first record to find the element names.
xmlChildren(rootNode[[1]][[1]])
## $name
## <name>410</name>
##
## $zipcode
## <zipcode>21206</zipcode>
##
## $neighborhood
## <neighborhood>Frankford</neighborhood>
##
## $councildistrict
## <councildistrict>2</councildistrict>
##
## $policedistrict
## <policedistrict>NORTHEASTERN</policedistrict>
##
## $location_1
## <location_1 human_address="{"address":"4509 BELAIR ROAD","city":"Baltimore","state":"MD","zip":""}" needs_recoding="true"/>
##
## attr(,"class")
## [1] "XMLInternalNodeList" "XMLNodeList"
# xmlValue yields text, so zipcode is a character vector; compare against
# a string instead of depending on implicit number-to-string coercion.
zipcode <- xpathSApply(rootNode, "//zipcode", xmlValue)
sum(zipcode == "21231")
## [1] 127
# Data: https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06pid.csv
# data.table timings: load the ss06pid.csv file with fread() and time
# several candidate expressions for the mean of pwgtp15 broken down by SEX.
library(data.table)
fileUrl <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Fss06pid.csv"
# Saved under its own file name: the original wrote this file to
# ./data/microdata_survey.csv, overwriting the ss06hid.csv download made
# earlier in this script with a different dataset.
download.file(fileUrl, destfile = "./data/microdata_person.csv")
DT <- fread("./data/microdata_person.csv")
# NOTE(review): each expression is timed once and the recorded timings are
# all ~0, so these runs do not separate the candidates from one another.

# data.table grouped aggregation.
system.time(DT[, mean(pwgtp15), by = SEX])
## user system elapsed
## 0 0 0
# Two separate subset-then-mean calls, one per SEX value; timings are added.
system.time(mean(DT[DT$SEX == 1, ]$pwgtp15)) +
  system.time(mean(DT[DT$SEX == 2, ]$pwgtp15))
## user system elapsed
## 0.00 0.02 0.01
# Split the column by SEX, then take the mean of each piece.
system.time(sapply(split(DT$pwgtp15, DT$SEX), mean))
## user system elapsed
## 0 0 0
# mean() has no `by` argument: it is absorbed by `...` and ignored, so this
# computes the overall mean rather than one mean per SEX.
system.time(mean(DT$pwgtp15, by = DT$SEX))
## user system elapsed
## 0 0 0
# Base R grouped mean.
system.time(tapply(DT$pwgtp15, DT$SEX, mean))
## user system elapsed
## 0 0 0
# Left disabled in the original.
#system.time(rowMeans(DT)[DT$SEX==1])+system.time(rowMeans(DT)[DT$SEX==2])