Quick Review:

Step 1: Read data into r

data <- read.csv("Climate_Ready_Boston_Social_Vulnerability.csv")  #read original data

Source: The data used is from Analyze Boston Datasets, click the link to see more information.

Description on dataset

Step 2: Imputation

Only when population, total disabilities, number of medical illness, people with LEP and low_to_no income are all equal to 0, we treat them as missing data. Then we impute the missing values by the mean of their neighbors. As you can see that only observations 46, 66 and 68(Census tract level) are treated as missing data and their neighors are shown below in the imputation.

impute <- data%>%
  select(GEOID10,POP100_RE,AREA_SQFT,TotDis,LEP,MedIllnes,Low_to_No,Name)
impute[46,2] <- impute[47,2]
impute[46,4] <- impute[47,4]
impute[46,5] <- impute[47,5]
impute[46,6] <- impute[47,6]
impute[46,7] <- impute[47,7]

neighb981201 <- impute%>%
  filter(GEOID10 == 25025060400|GEOID10 == 25025061000|GEOID10 == 25025061200|GEOID10 == 25025061101|GEOID10 == 25025091001|GEOID10 == 25025090901)
impute[66,2] <- mean(neighb981201$POP100_RE)
impute[66,4] <- mean(neighb981201$TotDis)
impute[66,5] <- mean(neighb981201$LEP)
impute[66,6] <- mean(neighb981201$MedIllnes)
impute[66,7] <- mean(neighb981201$Low_to_No)

neighb981700 <- impute%>%
  filter(GEOID10 == 25025020101|GEOID10 == 25025020302|GEOID10 == 25025070101|GEOID10 == 25025070200|GEOID10 == 25025070300|GEOID10 == 25025010702|GEOID10 == 25025010801)
impute[68,2] <- mean(neighb981700$POP100_RE)
impute[68,4] <- mean(neighb981700$TotDis)
impute[68,5] <- mean(neighb981700$LEP)
impute[68,6] <- mean(neighb981700$MedIllnes)
impute[68,7] <- mean(neighb981700$Low_to_No)

Step 3: Aggregate the population within each neighborhood in each indicator then calculate the proportion

e.g Proportion of disabilities in \(i^{th}\) neighborhood = # Total disabilitis in \(i^{th}\) neighborhood\(\div\)Population in \(i^{th}\) neighborhood = \(\frac{TolDis_{i}}{POP100RE_{i}}\)

neighb <- aggregate(impute[,2:7],by = list(data$Name),FUN = sum)
colnames(neighb)[1] <- "neighborhood"
neighb$econ_proportion <- neighb$Low_to_No/neighb$POP100_RE
neighb$lep_proportion <- neighb$LEP/neighb$POP100_RE
neighb$dis_proportion <- neighb$TotDis/neighb$POP100_RE
neighb$medill_proportion <- neighb$MedIllnes/neighb$POP100_RE

Step 4: Extract each indicator, imputing again in order to give Beacon Hill, Downtown and Chinatown estimated value since they are not in the neighborhood list but necessary for mapping.

(We give the calculated proportion of each indicator in Leather District to these three areas since Leather District is in the middle of them.)

#Econimic Status
econ_nb <- neighb[,c(1,8)]
missing <- data.frame(matrix(c("Beacon Hill", "Downtown", "Chinatown",NA,NA,NA), nrow = 3, ncol = 2, byrow  = F))
colnames(missing) <- colnames(econ_nb)

econ_nb <- rbind(econ_nb,missing)

econ_nb[24,2] <- econ_nb[12,2]
econ_nb[25,2] <- econ_nb[12,2]
econ_nb[26,2] <- econ_nb[12,2]
econ_nb$econ_proportion <- as.numeric(econ_nb$econ_proportion)

econ_nb$score <- rescale(-(econ_nb$econ_proportion),to=c(1,10)) 

kable(arrange(econ_nb,desc(score)))
neighborhood econ_proportion score
Longwood Medical Area 0.0382637 10.000000
South Boston Waterfront 0.0979625 9.125040
West Roxbury 0.1578076 8.247936
Hyde Park 0.1617727 8.189823
Roslindale 0.1844915 7.856850
Back Bay 0.1859449 7.835549
North End 0.1869076 7.821439
West End 0.2060411 7.541013
Leather District 0.2123009 7.449268
Beacon Hill 0.2123009 7.449268
Downtown 0.2123009 7.449268
Chinatown 0.2123009 7.449268
Fenway 0.2461862 6.952638
South Boston 0.2488331 6.913844
Charlestown 0.2528743 6.854616
South End 0.2740993 6.543536
Brighton 0.2806103 6.448109
Mattapan 0.2879195 6.340984
Jamaica Plain 0.3073300 6.056499
Dorchester 0.3296159 5.729871
East Boston 0.3370736 5.620570
Bay Village 0.3621198 5.253486
Mission Hill 0.3928508 4.803085
Roxbury 0.4275947 4.293870
Allston 0.4346681 4.190201
Harbor Islands 0.6523364 1.000000
#Communication
lep_nb <- neighb[,c(1,9)]
missing <- data.frame(matrix(c("Beacon Hill", "Downtown", "Chinatown",NA,NA,NA), nrow = 3, ncol = 2, byrow  = F))
colnames(missing) <- colnames(lep_nb)

lep_nb <- rbind(lep_nb,missing)

lep_nb[24,2] <- lep_nb[12,2]
lep_nb[25,2] <- lep_nb[12,2]
lep_nb[26,2] <- lep_nb[12,2]
lep_nb$lep_proportion <- as.numeric(lep_nb$lep_proportion)

lep_nb$score <- rescale(-(lep_nb$lep_proportion),to=c(1,10))

kable(arrange(lep_nb,desc(score)))
neighborhood lep_proportion score
Longwood Medical Area 0.0405266 10.000000
South Boston Waterfront 0.1722901 8.130247
Fenway 0.2644631 6.822293
Hyde Park 0.2955683 6.380903
North End 0.3126185 6.138957
West Roxbury 0.3160509 6.090251
Back Bay 0.3175061 6.069601
West End 0.3186925 6.052765
Roslindale 0.3194857 6.041510
Leather District 0.3498814 5.610188
Beacon Hill 0.3498814 5.610188
Downtown 0.3498814 5.610188
Chinatown 0.3498814 5.610188
South Boston 0.3573662 5.503977
South End 0.3605769 5.458417
Charlestown 0.3630391 5.423478
Brighton 0.3767449 5.228990
Mattapan 0.4109597 4.743474
Jamaica Plain 0.4110199 4.742620
Dorchester 0.4230204 4.572330
East Boston 0.4408496 4.319330
Allston 0.4536354 4.137896
Mission Hill 0.4604004 4.041899
Roxbury 0.5133784 3.290130
Bay Village 0.5198157 3.198784
Harbor Islands 0.6747664 1.000000
#Health
health_nb <- neighb[,c(1,10,11)]
missing <- data.frame(matrix(c("Beacon Hill", "Downtown", "Chinatown",NA,NA,NA,NA,NA,NA), nrow = 3, ncol = 3, byrow  = F))
colnames(missing) <- colnames(health_nb)

health_nb <- rbind(health_nb,missing)

health_nb[24,2:3] <- health_nb[12,2:3]
health_nb[25,2:3] <- health_nb[12,2:3]
health_nb[26,2:3] <- health_nb[12,2:3]
health_nb$dis_proportion <- as.numeric(health_nb$dis_proportion)
health_nb$medill_proportion <- as.numeric(health_nb$medill_proportion)

kable(health_nb%>%
  select(neighborhood,dis_proportion)%>%
  arrange(desc(dis_proportion)))
neighborhood dis_proportion
Harbor Islands 0.3345794
Mattapan 0.1693200
Roxbury 0.1620701
Dorchester 0.1394950
Hyde Park 0.1276455
East Boston 0.1251877
Roslindale 0.1194955
South End 0.1140458
West Roxbury 0.1124085
Jamaica Plain 0.1081631
Bay Village 0.1034101
Mission Hill 0.0981130
South Boston 0.0971133
Charlestown 0.0933755
Brighton 0.0904622
North End 0.0792654
Leather District 0.0774314
Beacon Hill 0.0774314
Downtown 0.0774314
Chinatown 0.0774314
West End 0.0688749
Back Bay 0.0660549
South Boston Waterfront 0.0581907
Allston 0.0526870
Fenway 0.0456924
Longwood Medical Area 0.0425838
kable(health_nb%>%
  select(neighborhood,medill_proportion)%>%
  arrange(desc(medill_proportion)))
neighborhood medill_proportion
Harbor Islands 0.4308972
North End 0.4275983
Back Bay 0.4134125
South Boston Waterfront 0.4099315
West End 0.4097213
Leather District 0.4062335
Beacon Hill 0.4062335
Downtown 0.4062335
Chinatown 0.4062335
South Boston 0.4044573
South End 0.3965920
Bay Village 0.3956618
West Roxbury 0.3955603
Roslindale 0.3940977
Brighton 0.3935614
Charlestown 0.3930574
Jamaica Plain 0.3929036
Hyde Park 0.3866421
Longwood Medical Area 0.3752479
Mission Hill 0.3750855
Mattapan 0.3744939
Allston 0.3730045
Fenway 0.3714622
East Boston 0.3679612
Roxbury 0.3628631
Dorchester 0.3600302

Step 5: PCA

#PCA
pr.out <- prcomp(health_nb[,2:3],scale = T)
pr.var <- pr.out$sdev^2
pve <- pr.var/sum(pr.var)
plot(pve, xlab = "Principal Component", ylab = "Proportion of Variance Explained", ylim = c(0,1), type = 'b')

Here we encounter a problem that PC1 seems not strong enough to explain disability proportion and medical illness proportion.

Question:

  1. How to deal with problem in PCA? When we did PCA at census tract level, this situation didn’t exist. But after aggregating census tracts into neighborhoods, it’s untenable to use only pc1 to replace two variables in Health and Wellness.

  2. Do the new results kind of make sense compared to the previous results? If they do, we can move on to solve PCA problem, if they don’t, what is the problem? To me, only imputation and rescaling can cause some issues to the results, so if they trouble, how can we improve the process of imputation and rescaling.