knitr::opts_chunk$set(echo = TRUE)
initialize <-function()
{
library(stringr)
library(tidyr)
library(dplyr)
library(plyr)
}
initialize()
getCensusDF <- function(fileName = "census_disability.csv")
{
##dfCensus <- read.csv(fileName, stringsAsFactors = FALSE)
dfCensus <- read.csv("census_disability.csv", stringsAsFactors = FALSE)
##remove top comment rows and bottom comment rows which appear after "independent living difficulty" row
##dfCensus <- slice(dfCensus, 6:which(str_trim(dfCensus[,1]) == "Independent living difficulty"))
dfCensus <- slice(dfCensus, 6:(which(str_trim(dfCensus[,1]) == "SEX") -2) )
dfCensus <- subset(dfCensus, select = c(-X,-X.1,-X.3, -X.4))
#colnames(dfCensus) <- seq(1:length(dfCensus)) #makes columns names numbers
dfCensus[1,2:length(dfCensus)] <- unlist(rep(dfCensus[1, seq(2, length(dfCensus), 6)], each = 6)) # file in blank header cells
dfCensus[2,2:length(dfCensus)] <- unlist(rep(dfCensus[2, seq(2, length(dfCensus), 2)], each = 2)) # fill in blank subheader cells
dfCensus[2,] <- str_c(dfCensus[2,], " ", dfCensus[3,]) #combine header/subheader
colnames(dfCensus) <- dfCensus[1,] #make header row into columnames
dfCensus <- dfCensus[c(-1,-3),] #remove rows that were header/subheader
return(subset(dfCensus, (str_trim(dfCensus[,2])!= ""))) # remove blank lines and return
}
getDataRotatedViaState <-function(df)
{
dfTranspose <- NULL
#loop thru to do one state (or us as whole) at a time
while (length(dfCensus) >=7)
{
dfOneLoc <- dfCensus[,1:7] # get df of one location
#insert location into df as col2
dfOneLoc <- cbind(Subject = dfOneLoc[,1], Location = rep(colnames(dfCensus[2]), nrow(dfCensus)), dfOneLoc[,2:7])
colnames(dfOneLoc) <- c(colnames(dfOneLoc[1:2]), dfOneLoc[1,3:8]) #rename some cols
dfOneLoc <- dfOneLoc[-1,] #remove header row that is now colnames
if (is.null(dfTranspose))
{
dfTranspose <- dfOneLoc
}
else #if 2+ loop, append
{
dfTranspose <- bind_rows(dfTranspose, dfOneLoc)
}
if(!length(dfCensus) <8) #if still another state left to process, chop off front that we just transposed
{
dfCensus <- select(dfCensus, Subject, 8:length(dfCensus))
}
else
{
dfCensus <- dfCensus[,-7] #drop a column so fall out of loop
}
}
return(dfTranspose)
}
createDFWithAgeCol <- function(dfTranspose, dfOrig)
{
#get list of ages, e.g., "population under 5 years
ageGroup <- unique(as.character(filter(dfTranspose, grepl("Population", Subject))[,1]))
#get list of disabilities e.g., with a hearing difficulty
disGroup <- c("Total", unique(as.character(filter(dfTranspose, grepl("With", Subject))[,1])))
#create df with setup with appropriate number of rows for an individual location
dfDis <- data.frame(Disability = rep(disGroup, length(ageGroup)), Age = rep(ageGroup, 1, each = length(disGroup)),
stringsAsFactors = FALSE)
cols <- colnames(dfTranspose)[2:length(dfTranspose)]
dfDis[, cols] <- " " ##leave spaces instead of nulls or NA's do to problem assigning values below
dfDis <- rbind(" ", dfDis)
dfDis[[1,1]] <- "Total for Location"
dfDis[[1,2]] <- "Total All Groups"
dfTemp <- NULL
locations <- unique(dfTranspose$Location) #i.e., the states and US
#loop for unique number of locations to create empty rows (mostly) for each location
for (i in 1:(length(unique(dfTranspose$Location))))
{
if (is.null(dfTemp))
{
dfDis$Location <- locations[i]
dfTemp <- dfDis
}
else
{
dfDis$Location <- locations[i]
dfTemp <- rbind(dfTemp, dfDis)
}
}
dfDis <- dfTemp
rm(dfTemp)
return(dfDis)
}
#combines mostly empty dfAge df, with partially tidy dftidy
createFinalTidy <-function(dfTidy, dfAge)
{
#populate total lines
dfAge[str_detect(dfAge$Age, "Total"),3:length(dfAge)] <- dfTidy[str_detect(dfTidy$Subject, "Total"),2:length(dfTidy)]
#populate all rows with actual data
for(loop in 2:nrow(dfTidy))
{
#if we are at a new population set the total line
if (!is.na(str_extract(dfTidy[loop,"Subject"], "^Population")))
{
ageGroup <- as.character(dfTidy[loop,"Subject"])
dfAge[dfAge$Disability == "Total"
& dfAge$Age == ageGroup
& dfAge$Location == dfTidy$Location[loop]
,3:9
] <- dfTidy[loop, 2:8]
}
else # set the values for other lines
{
dfAge[dfAge$Disability == as.character(dfTidy$Subject[loop])
& dfAge$Age == ageGroup
& dfAge$Location == as.character(dfTidy$Location[loop])
,3:9
] <- dfTidy[loop,2:8]
}
}
#"private" function to make all blanks or (x) to be NA
updateBlanksToNA <-function(col)
{
col[str_detect(col, "^ ")] <- NA
col[str_detect(col, "(X)")] <- NA
return(col)
}
dfAge <- data.frame(lapply(dfAge, function(x) {str_replace_all(x, ",*", "")}))
dfAge$Percent.with.a.disability.Estimate <-
unlist(lapply(dfAge$Percent.with.a.disability.Estimate, function(x) {str_replace(x, "%", "")} ))
dfAge$Total.Margin.of.Error <-
unlist(lapply(dfAge$Total.Margin.of.Error, function(x) {str_replace(x, "\\+/-", "")} ))
dfAge$Percent.with.a.disability.Margin.of.Error <-
unlist(lapply(dfAge$Percent.with.a.disability.Margin.of.Error, function(x) {str_replace(x, "\\+/-", "")} ))
dfAge$With.a.disability.Margin.of.Error <-
unlist(lapply(dfAge$With.a.disability.Margin.of.Error, function(x) {str_replace(x, "\\+/-", "")} ))
#update blanks function changed to matrix, so coerce into df
dfAge <- data.frame(apply(dfAge, 2, updateBlanksToNA),stringsAsFactors = FALSE)
return(dfAge)
}
read.csv("census_disability.csv")[6:20,1:10]
## S1810..DISABILITY.CHARACTERISTICS X X.1 X.2 X.3
## 6 Subject NA United States NA
## 7 NA Total NA
## 8 NA Estimate NA
## 9 Total civilian noninstitutionalized population NA 308,896,460 NA
## 10 NA NA
## 11 Population under 5 years NA 19,907,090 NA
## 12 With a hearing difficulty NA (X) NA
## 13 With a vision difficulty NA (X) NA
## 14 NA NA
## 15 Population 5 to 17 years NA 53,670,414 NA
## 16 With a hearing difficulty NA (X) NA
## 17 With a vision difficulty NA (X) NA
## 18 With a cognitive difficulty NA (X) NA
## 19 With an ambulatory difficulty NA (X) NA
## 20 With a self-care difficulty NA (X) NA
## X.4 X.5 X.6 X.7
## 6 NA
## 7 NA With a disability
## 8 NA Margin of Error Estimate Margin of Error
## 9 NA +/-13,901 37,633,020 +/-94,880
## 10 NA
## 11 NA +/-18,231 167,847 +/-7,660
## 12 NA (X) 111,183 +/-6,251
## 13 NA (X) 103,607 +/-6,935
## 14 NA
## 15 NA +/-24,933 2,850,468 +/-27,707
## 16 NA (X) 339,548 +/-9,887
## 17 NA (X) 430,869 +/-10,767
## 18 NA (X) 2,153,873 +/-24,643
## 19 NA (X) 344,654 +/-9,558
## 20 NA (X) 516,527 +/-13,262
## X.8
## 6
## 7 Percent with a disability
## 8 Estimate
## 9 12.2%
## 10
## 11 0.8%
## 12 0.6%
## 13 0.5%
## 14
## 15 5.3%
## 16 0.6%
## 17 0.8%
## 18 4.0%
## 19 0.6%
## 20 1.0%
dfCensus <- getCensusDF()
dfCensus[1:5,1:5]
## Subject United States
## 2 Total Estimate
## 4 Total civilian noninstitutionalized population 308,896,460
## 6 Population under 5 years 19,907,090
## 7 With a hearing difficulty (X)
## 8 With a vision difficulty (X)
## United States.1 United States.2
## 2 Total Margin of Error With a disability Estimate
## 4 +/-13,901 37,633,020
## 6 +/-18,231 167,847
## 7 (X) 111,183
## 8 (X) 103,607
## United States.3
## 2 With a disability Margin of Error
## 4 +/-94,880
## 6 +/-7,660
## 7 +/-6,251
## 8 +/-6,935
dfTidy <- getDataRotatedViaState(dfCensus)
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
dfTidy[1:5,1:5]
## Subject Location
## 1 Total civilian noninstitutionalized population United States
## 2 Population under 5 years United States
## 3 With a hearing difficulty United States
## 4 With a vision difficulty United States
## 5 Population 5 to 17 years United States
## Total Estimate Total Margin of Error With a disability Estimate
## 1 308,896,460 +/-13,901 37,633,020
## 2 19,907,090 +/-18,231 167,847
## 3 (X) (X) 111,183
## 4 (X) (X) 103,607
## 5 53,670,414 +/-24,933 2,850,468
dfAge <- createDFWithAgeCol(dfTidy, dfCensus)
dfAge[1:6,1:6]
## Disability Age Location
## 1 Total for Location Total All Groups United States
## 2 Total Population under 5 years United States
## 3 With a hearing difficulty Population under 5 years United States
## 4 With a vision difficulty Population under 5 years United States
## 5 With a cognitive difficulty Population under 5 years United States
## 6 With an ambulatory difficulty Population under 5 years United States
## Total Estimate Total Margin of Error With a disability Estimate
## 1
## 2
## 3
## 4
## 5
## 6
dfTidy <- createFinalTidy(dfTidy, dfAge)
dfTidy[1:3,]
## Disability Age Location
## 1 Total for Location Total All Groups United States
## 2 Total Population under 5 years United States
## 3 With a hearing difficulty Population under 5 years United States
## Total.Estimate Total.Margin.of.Error With.a.disability.Estimate
## 1 308896460 13901 37633020
## 2 19907090 18231 167847
## 3 <NA> <NA> 111183
## With.a.disability.Margin.of.Error Percent.with.a.disability.Estimate
## 1 94880 12.2
## 2 7660 0.8
## 3 6251 0.6
## Percent.with.a.disability.Margin.of.Error
## 1 0.1
## 2 0.1
## 3 0.1
dfTidy[1:33,1:3]
## Disability Age
## 1 Total for Location Total All Groups
## 2 Total Population under 5 years
## 3 With a hearing difficulty Population under 5 years
## 4 With a vision difficulty Population under 5 years
## 5 With a cognitive difficulty Population under 5 years
## 6 With an ambulatory difficulty Population under 5 years
## 7 With a self-care difficulty Population under 5 years
## 8 With an independent living difficulty Population under 5 years
## 9 Total Population 5 to 17 years
## 10 With a hearing difficulty Population 5 to 17 years
## 11 With a vision difficulty Population 5 to 17 years
## 12 With a cognitive difficulty Population 5 to 17 years
## 13 With an ambulatory difficulty Population 5 to 17 years
## 14 With a self-care difficulty Population 5 to 17 years
## 15 With an independent living difficulty Population 5 to 17 years
## 16 Total Population 18 to 64 years
## 17 With a hearing difficulty Population 18 to 64 years
## 18 With a vision difficulty Population 18 to 64 years
## 19 With a cognitive difficulty Population 18 to 64 years
## 20 With an ambulatory difficulty Population 18 to 64 years
## 21 With a self-care difficulty Population 18 to 64 years
## 22 With an independent living difficulty Population 18 to 64 years
## 23 Total Population 65 years and over
## 24 With a hearing difficulty Population 65 years and over
## 25 With a vision difficulty Population 65 years and over
## 26 With a cognitive difficulty Population 65 years and over
## 27 With an ambulatory difficulty Population 65 years and over
## 28 With a self-care difficulty Population 65 years and over
## 29 With an independent living difficulty Population 65 years and over
## 30 Total for Location Total All Groups
## 31 Total Population under 5 years
## 32 With a hearing difficulty Population under 5 years
## 33 With a vision difficulty Population under 5 years
## Location
## 1 United States
## 2 United States
## 3 United States
## 4 United States
## 5 United States
## 6 United States
## 7 United States
## 8 United States
## 9 United States
## 10 United States
## 11 United States
## 12 United States
## 13 United States
## 14 United States
## 15 United States
## 16 United States
## 17 United States
## 18 United States
## 19 United States
## 20 United States
## 21 United States
## 22 United States
## 23 United States
## 24 United States
## 25 United States
## 26 United States
## 27 United States
## 28 United States
## 29 United States
## 30 Connecticut
## 31 Connecticut
## 32 Connecticut
## 33 Connecticut
filter(dfTidy, dfTidy$Disability == "Total")
## Disability Age Location Total.Estimate
## 1 Total Population under 5 years United States 19907090
## 2 Total Population 5 to 17 years United States 53670414
## 3 Total Population 18 to 64 years United States 193478987
## 4 Total Population 65 years and over United States 41839969
## 5 Total Population under 5 years Connecticut 193070
## 6 Total Population 5 to 17 years Connecticut 598755
## 7 Total Population 18 to 64 years Connecticut 2233159
## 8 Total Population 65 years and over Connecticut 509635
## 9 Total Population under 5 years Massachusetts 363859
## 10 Total Population 5 to 17 years Massachusetts 1034113
## 11 Total Population 18 to 64 years Massachusetts 4246935
## 12 Total Population 65 years and over Massachusetts 921944
## 13 Total Population under 5 years New York 1163871
## 14 Total Population 5 to 17 years New York 3088950
## 15 Total Population 18 to 64 years New York 12402577
## 16 Total Population 65 years and over New York 2661330
## Total.Margin.of.Error With.a.disability.Estimate
## 1 18231 167847
## 2 24933 2850468
## 3 25377 19606506
## 4 18303 15008199
## 5 984 1406
## 6 1078 29839
## 7 2310 183789
## 8 1466 161584
## 9 988 2861
## 10 1199 61550
## 11 2651 377194
## 12 2148 310718
## 13 1967 10259
## 14 2444 145123
## 15 4328 1043603
## 16 2929 900473
## With.a.disability.Margin.of.Error Percent.with.a.disability.Estimate
## 1 7660 0.8
## 2 27707 5.3
## 3 67395 10.1
## 4 49325 35.9
## 5 670 0.7
## 6 2835 5.0
## 7 6689 8.2
## 8 4084 31.7
## 9 865 0.8
## 10 4549 6.0
## 11 10801 8.9
## 12 7384 33.7
## 13 1555 0.9
## 14 6492 4.7
## 15 16898 8.4
## 16 11218 33.8
## Percent.with.a.disability.Margin.of.Error
## 1 0.1
## 2 0.1
## 3 0.1
## 4 0.1
## 5 0.3
## 6 0.5
## 7 0.3
## 8 0.8
## 9 0.2
## 10 0.4
## 11 0.3
## 12 0.8
## 13 0.1
## 14 0.2
## 15 0.1
## 16 0.4
filter(dfTidy, as.numeric(dfTidy$Percent.with.a.disability.Estimate) > 20)
## Disability Age Location
## 1 Total Population 65 years and over United States
## 2 With an ambulatory difficulty Population 65 years and over United States
## 3 Total Population 65 years and over Connecticut
## 4 Total Population 65 years and over Massachusetts
## 5 With an ambulatory difficulty Population 65 years and over Massachusetts
## 6 Total Population 65 years and over New York
## 7 With an ambulatory difficulty Population 65 years and over New York
## Total.Estimate Total.Margin.of.Error With.a.disability.Estimate
## 1 41839969 18303 15008199
## 2 <NA> <NA> 9675415
## 3 509635 1466 161584
## 4 921944 2148 310718
## 5 <NA> <NA> 190794
## 6 2661330 2929 900473
## 7 <NA> <NA> 613547
## With.a.disability.Margin.of.Error Percent.with.a.disability.Estimate
## 1 49325 35.9
## 2 43948 23.1
## 3 4084 31.7
## 4 7384 33.7
## 5 6206 20.7
## 6 11218 33.8
## 7 9397 23.1
## Percent.with.a.disability.Margin.of.Error
## 1 0.1
## 2 0.1
## 3 0.8
## 4 0.8
## 5 0.7
## 6 0.4
## 7 0.3
dfTidy[which.max(dfTidy$Total.Estimate),]
## Disability Age Location Total.Estimate
## 1 Total for Location Total All Groups United States 308896460
## Total.Margin.of.Error With.a.disability.Estimate
## 1 13901 37633020
## With.a.disability.Margin.of.Error Percent.with.a.disability.Estimate
## 1 94880 12.2
## Percent.with.a.disability.Margin.of.Error
## 1 0.1
dfTidy[which.min(dfTidy$Total.Estimate),]
## Disability Age Location Total.Estimate
## 31 Total Population under 5 years Connecticut 193070
## Total.Margin.of.Error With.a.disability.Estimate
## 31 984 1406
## With.a.disability.Margin.of.Error Percent.with.a.disability.Estimate
## 31 670 0.7
## Percent.with.a.disability.Margin.of.Error
## 31 0.3
dfTidy[which.max(dfTidy$With.a.disability.Estimate),]
## Disability Age Location Total.Estimate
## 1 Total for Location Total All Groups United States 308896460
## Total.Margin.of.Error With.a.disability.Estimate
## 1 13901 37633020
## With.a.disability.Margin.of.Error Percent.with.a.disability.Estimate
## 1 94880 12.2
## Percent.with.a.disability.Margin.of.Error
## 1 0.1
dfTidy[which.min(dfTidy$Total.Estimate),]
## Disability Age Location Total.Estimate
## 31 Total Population under 5 years Connecticut 193070
## Total.Margin.of.Error With.a.disability.Estimate
## 31 984 1406
## With.a.disability.Margin.of.Error Percent.with.a.disability.Estimate
## 31 670 0.7
## Percent.with.a.disability.Margin.of.Error
## 31 0.3
dfTidyNoTot <- dfTidy[dfTidy$Age != "Total All Groups",]
summarise(dfTidyNoTot, avg = mean(as.numeric(dfTidyNoTot$Percent.with.a.disability.Estimate), na.rm = TRUE))
## avg
## 1 6.348913