Cary K. Jim 2022-02-01
## Data Source: U.S. Census Data <https://data.census.gov/cedsci/>
# Initial data processing for Census ACS Table B28005
# B28005 Table contains number of household, household with under 18, PC ownership
# and subscription type information
# Initial data cleaning was done in Excel where margin of errors are removed
# Column headings have been shorten for ease of reading
library(tidyverse)## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
setwd("D://Documents//R//Dissertation")
# This data file has been exported from Census Data Portal at ZIP-code level
b28005_ZIP <- read.csv('./Datasets/Census/ACS_5Y2019.B28005_data_ZIP_code.csv',
header = T, stringsAsFactors = T)
head(b28005_ZIP)## GEO_ID Geographic_Area ZIP Total_Household_byZIP Home_Stu
## 1 8600000US00601 ZCTA5 601 17038 3540
## 2 8600000US00602 ZCTA5 602 37668 6878
## 3 8600000US00603 ZCTA5 603 45586 8961
## 4 8600000US00606 ZCTA5 606 6373 1183
## 5 8600000US00610 ZCTA5 610 26556 5001
## 6 8600000US00612 ZCTA5 612 58764 11819
## Stu_Has_computer Stu_Has_computer_With_dial.up
## 1 2797 0
## 2 5693 1071
## 3 8103 55
## 4 712 0
## 5 4388 1079
## 6 10491 10
## Stu_Has_computer_With_broadband Stu_Has_computer_No_Internet Stu_No.computer
## 1 2247 550 743
## 2 4052 570 1185
## 3 7172 876 858
## 4 517 195 471
## 5 2825 484 613
## 6 9331 1150 1328
str(b28005_ZIP)## 'data.frame': 33120 obs. of 10 variables:
## $ GEO_ID : Factor w/ 33120 levels "8600000US00601",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Geographic_Area : Factor w/ 1 level "ZCTA5": 1 1 1 1 1 1 1 1 1 1 ...
## $ ZIP : int 601 602 603 606 610 612 616 617 622 623 ...
## $ Total_Household_byZIP : int 17038 37668 45586 6373 26556 58764 9981 23735 6678 41664 ...
## $ Home_Stu : int 3540 6878 8961 1183 5001 11819 1357 4895 735 8221 ...
## $ Stu_Has_computer : int 2797 5693 8103 712 4388 10491 1170 4405 295 3435 ...
## $ Stu_Has_computer_With_dial.up : int 0 1071 55 0 1079 10 0 94 28 0 ...
## $ Stu_Has_computer_With_broadband: int 2247 4052 7172 517 2825 9331 1018 3475 228 3178 ...
## $ Stu_Has_computer_No_Internet : int 550 570 876 195 484 1150 152 836 39 257 ...
## $ Stu_No.computer : int 743 1185 858 471 613 1328 187 490 440 4786 ...
b28005_County <- read.csv('./Datasets/Census/ACSDT5Y2019.B28005.csv',
header = T, stringsAsFactors = T)
head(b28005_County)## GEO_ID COUNTY.ID COUNTY.NAME STATE Total_Household
## 1 0500000US01001 1001 Autauga County Alabama 54815
## 2 0500000US01003 1003 Baldwin County Alabama 209822
## 3 0500000US01005 1005 Barbour County Alabama 22475
## 4 0500000US01007 1007 Bibb County Alabama 20600
## 5 0500000US01009 1009 Blount County Alabama 57130
## 6 0500000US01011 1011 Bullock County Alabama 9811
## Total_household_w_children Home_hasPC Home_hasPC_andBroadband
## 1 13205 12917 11878
## 2 46133 45591 42506
## 3 5307 4622 3586
## 4 4586 4272 4108
## 5 13389 12730 11487
## 6 2128 1854 1162
## Home_withNoInternet Home_with_NoPC Percent_Home_w_Stu Percent_Stu_home_w_PC
## 1 1039 288 0.24 0.98
## 2 3074 542 0.22 0.99
## 3 1029 685 0.24 0.87
## 4 164 314 0.22 0.93
## 5 1144 659 0.23 0.95
## 6 692 274 0.22 0.87
## Percent_Stu_PC_and_Broadband
## 1 0.9
## 2 0.92
## 3 0.68
## 4 0.9
## 5 0.86
## 6 0.55
str(b28005_County)## 'data.frame': 3142 obs. of 13 variables:
## $ GEO_ID : Factor w/ 3142 levels "0500000US01001",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ COUNTY.ID : int 1001 1003 1005 1007 1009 1011 1013 1015 1017 1019 ...
## $ COUNTY.NAME : Factor w/ 1877 levels "Abbeville County",..: 83 90 101 150 165 226 236 249 297 319 ...
## $ STATE : Factor w/ 51 levels " Alabama"," Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Total_Household : int 54815 209822 22475 20600 57130 9811 19505 111484 33128 25628 ...
## $ Total_household_w_children : int 13205 46133 5307 4586 13389 2128 4455 24817 7065 5245 ...
## $ Home_hasPC : int 12917 45591 4622 4272 12730 1854 3684 24327 6477 5166 ...
## $ Home_hasPC_andBroadband : int 11878 42506 3586 4108 11487 1162 2969 21806 5846 4730 ...
## $ Home_withNoInternet : int 1039 3074 1029 164 1144 692 629 2432 544 433 ...
## $ Home_with_NoPC : int 288 542 685 314 659 274 771 490 588 79 ...
## $ Percent_Home_w_Stu : num 0.24 0.22 0.24 0.22 0.23 0.22 0.23 0.22 0.21 0.2 ...
## $ Percent_Stu_home_w_PC : Factor w/ 45 levels "#DIV/0!","0.46",..: 43 44 32 38 40 32 28 43 37 44 ...
## $ Percent_Stu_PC_and_Broadband: Factor w/ 64 levels "#DIV/0!","0.25",..: 54 56 32 54 50 19 31 52 47 54 ...
# Append general population count to this table by zip level
## First, I had to retrieve a population data file at ZIP level from Census
pop_acs <- read.csv("./Datasets/Census/ACS_population.csv", header = T, stringsAsFactors = T)
head(pop_acs)## GEO_ID Geographic_AREA Total_Pop_Est
## 1 8600000US00601 ZCTA5 00601 17113
## 2 8600000US00602 ZCTA5 00602 37751
## 3 8600000US00603 ZCTA5 00603 47081
## 4 8600000US00606 ZCTA5 00606 6392
## 5 8600000US00610 ZCTA5 00610 26686
## 6 8600000US00612 ZCTA5 00612 59369
str(pop_acs)## 'data.frame': 33120 obs. of 3 variables:
## $ GEO_ID : Factor w/ 33120 levels "8600000US00601",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Geographic_AREA: Factor w/ 33120 levels "ZCTA5 00601",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Total_Pop_Est : int 17113 37751 47081 6392 26686 59369 10022 23750 6741 41746 ...
# Similar to other datasets, ZIP has been read as integer
b28005_ZIP[3] <- lapply(b28005_ZIP[3], as.character)
b28005_ZIP[3] <- lapply(b28005_ZIP[3], as.factor)
str(b28005_ZIP) # There are 33120 levels in ZIP## 'data.frame': 33120 obs. of 10 variables:
## $ GEO_ID : Factor w/ 33120 levels "8600000US00601",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Geographic_Area : Factor w/ 1 level "ZCTA5": 1 1 1 1 1 1 1 1 1 1 ...
## $ ZIP : Factor w/ 33120 levels "10001","10002",..: 19476 19553 19565 19719 19865 19956 20179 20193 20411 20486 ...
## $ Total_Household_byZIP : int 17038 37668 45586 6373 26556 58764 9981 23735 6678 41664 ...
## $ Home_Stu : int 3540 6878 8961 1183 5001 11819 1357 4895 735 8221 ...
## $ Stu_Has_computer : int 2797 5693 8103 712 4388 10491 1170 4405 295 3435 ...
## $ Stu_Has_computer_With_dial.up : int 0 1071 55 0 1079 10 0 94 28 0 ...
## $ Stu_Has_computer_With_broadband: int 2247 4052 7172 517 2825 9331 1018 3475 228 3178 ...
## $ Stu_Has_computer_No_Internet : int 550 570 876 195 484 1150 152 836 39 257 ...
## $ Stu_No.computer : int 743 1185 858 471 613 1328 187 490 440 4786 ...
# Check data again
str(b28005_ZIP)## 'data.frame': 33120 obs. of 10 variables:
## $ GEO_ID : Factor w/ 33120 levels "8600000US00601",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Geographic_Area : Factor w/ 1 level "ZCTA5": 1 1 1 1 1 1 1 1 1 1 ...
## $ ZIP : Factor w/ 33120 levels "10001","10002",..: 19476 19553 19565 19719 19865 19956 20179 20193 20411 20486 ...
## $ Total_Household_byZIP : int 17038 37668 45586 6373 26556 58764 9981 23735 6678 41664 ...
## $ Home_Stu : int 3540 6878 8961 1183 5001 11819 1357 4895 735 8221 ...
## $ Stu_Has_computer : int 2797 5693 8103 712 4388 10491 1170 4405 295 3435 ...
## $ Stu_Has_computer_With_dial.up : int 0 1071 55 0 1079 10 0 94 28 0 ...
## $ Stu_Has_computer_With_broadband: int 2247 4052 7172 517 2825 9331 1018 3475 228 3178 ...
## $ Stu_Has_computer_No_Internet : int 550 570 876 195 484 1150 152 836 39 257 ...
## $ Stu_No.computer : int 743 1185 858 471 613 1328 187 490 440 4786 ...
# Check for missing values - incomplete cases (None returned)
b28005_ZIP[!complete.cases(b28005_ZIP),]## [1] GEO_ID Geographic_Area
## [3] ZIP Total_Household_byZIP
## [5] Home_Stu Stu_Has_computer
## [7] Stu_Has_computer_With_dial.up Stu_Has_computer_With_broadband
## [9] Stu_Has_computer_No_Internet Stu_No.computer
## <0 rows> (or 0-length row.names)
# Convert ZIP level data columns to percentages
## How many percent of household with children estimated to have computer?
b28005_ZIP$Percent_with_PC <- b28005_ZIP$Stu_Has_computer/b28005_ZIP$Home_Stu
## How many percent of household with children estimated to not have computer?
b28005_ZIP$Percent_no_PC <- (1 - b28005_ZIP$Percent_with_PC)
## How many percent of children at home with computer and internet (both broadband and dial-up)?
b28005_ZIP$With_Both_PC_internet <- (b28005_ZIP$Stu_Has_computer_With_dial.up + b28005_ZIP$Stu_Has_computer_With_broadband)/b28005_ZIP$Home_Stu
## How many percent of children at home with computer and broadband internet?
b28005_ZIP$With_PC_broadband <- b28005_ZIP$Stu_Has_computer_With_broadband/b28005_ZIP$Home_Stu
## How many percent of children at home with computer and dial-up service?
b28005_ZIP$With_PC_dialup <- b28005_ZIP$Stu_Has_computer_With_dial.up/b28005_ZIP$Home_Stu
## How many percent of children at home have PC but no internet?
b28005_ZIP$With_PC_noInternet <- b28005_ZIP$Stu_Has_computer_No_Internet/b28005_ZIP$Home_Stu
# View Column Names
colnames(b28005_ZIP)## [1] "GEO_ID" "Geographic_Area"
## [3] "ZIP" "Total_Household_byZIP"
## [5] "Home_Stu" "Stu_Has_computer"
## [7] "Stu_Has_computer_With_dial.up" "Stu_Has_computer_With_broadband"
## [9] "Stu_Has_computer_No_Internet" "Stu_No.computer"
## [11] "Percent_with_PC" "Percent_no_PC"
## [13] "With_Both_PC_internet" "With_PC_broadband"
## [15] "With_PC_dialup" "With_PC_noInternet"
## Drop the columns with raw estimated counts.
b28005_ZIP_c <- b28005_ZIP[, c(1:5, 11:16 )]
# Export this file first
# write.csv(b28005_ZIP_c, "./Export/b28005_zip_percent.csv", row.names = F)
# Join the two dataframes
ACS_internet <- left_join(b28005_ZIP_c, pop_acs, by = "GEO_ID")
#export the census file
#write.csv(ACS_internet, "./Export/b28005_mod.csv", row.names = F)