data <- read.csv("Climate_Ready_Boston_Social_Vulnerability.csv") #read original data
Source: The data used is from Analyze Boston Datasets, click the link to see more information.
Only when population, total disabilities, number of medical illness, people with LEP and low_to_no income are all equal to 0, we treat them as missing data. Then we impute the missing values by the mean of their neighbors. As you can see that only observations 46, 66 and 68(Census tract level) are treated as missing data and their neighors are shown below in the imputation.
impute <- data%>%
select(GEOID10,POP100_RE,AREA_SQFT,TotDis,LEP,MedIllnes,Low_to_No,Name)
impute[46,2] <- impute[47,2]
impute[46,4] <- impute[47,4]
impute[46,5] <- impute[47,5]
impute[46,6] <- impute[47,6]
impute[46,7] <- impute[47,7]
neighb981201 <- impute%>%
filter(GEOID10 == 25025060400|GEOID10 == 25025061000|GEOID10 == 25025061200|GEOID10 == 25025061101|GEOID10 == 25025091001|GEOID10 == 25025090901)
impute[66,2] <- mean(neighb981201$POP100_RE)
impute[66,4] <- mean(neighb981201$TotDis)
impute[66,5] <- mean(neighb981201$LEP)
impute[66,6] <- mean(neighb981201$MedIllnes)
impute[66,7] <- mean(neighb981201$Low_to_No)
neighb981700 <- impute%>%
filter(GEOID10 == 25025020101|GEOID10 == 25025020302|GEOID10 == 25025070101|GEOID10 == 25025070200|GEOID10 == 25025070300|GEOID10 == 25025010702|GEOID10 == 25025010801)
impute[68,2] <- mean(neighb981700$POP100_RE)
impute[68,4] <- mean(neighb981700$TotDis)
impute[68,5] <- mean(neighb981700$LEP)
impute[68,6] <- mean(neighb981700$MedIllnes)
impute[68,7] <- mean(neighb981700$Low_to_No)
e.g Proportion of disabilities in \(i^{th}\) neighborhood = # Total disabilitis in \(i^{th}\) neighborhood\(\div\)Population in \(i^{th}\) neighborhood = \(\frac{TolDis_{i}}{POP100RE_{i}}\)
neighb <- aggregate(impute[,2:7],by = list(data$Name),FUN = sum)
colnames(neighb)[1] <- "neighborhood"
neighb$econ_proportion <- neighb$Low_to_No/neighb$POP100_RE
neighb$lep_proportion <- neighb$LEP/neighb$POP100_RE
neighb$dis_proportion <- neighb$TotDis/neighb$POP100_RE
neighb$medill_proportion <- neighb$MedIllnes/neighb$POP100_RE
(We give the calculated proportion of each indicator in Leather District to these three areas since Leather District is in the middle of them.)
#Econimic Status
econ_nb <- neighb[,c(1,8)]
missing <- data.frame(matrix(c("Beacon Hill", "Downtown", "Chinatown",NA,NA,NA), nrow = 3, ncol = 2, byrow = F))
colnames(missing) <- colnames(econ_nb)
econ_nb <- rbind(econ_nb,missing)
econ_nb[24,2] <- econ_nb[12,2]
econ_nb[25,2] <- econ_nb[12,2]
econ_nb[26,2] <- econ_nb[12,2]
econ_nb$econ_proportion <- as.numeric(econ_nb$econ_proportion)
econ_nb$score <- rescale(-(econ_nb$econ_proportion),to=c(1,10))
kable(arrange(econ_nb,desc(score)))
| neighborhood | econ_proportion | score |
|---|---|---|
| Longwood Medical Area | 0.0382637 | 10.000000 |
| South Boston Waterfront | 0.0979625 | 9.125040 |
| West Roxbury | 0.1578076 | 8.247936 |
| Hyde Park | 0.1617727 | 8.189823 |
| Roslindale | 0.1844915 | 7.856850 |
| Back Bay | 0.1859449 | 7.835549 |
| North End | 0.1869076 | 7.821439 |
| West End | 0.2060411 | 7.541013 |
| Leather District | 0.2123009 | 7.449268 |
| Beacon Hill | 0.2123009 | 7.449268 |
| Downtown | 0.2123009 | 7.449268 |
| Chinatown | 0.2123009 | 7.449268 |
| Fenway | 0.2461862 | 6.952638 |
| South Boston | 0.2488331 | 6.913844 |
| Charlestown | 0.2528743 | 6.854616 |
| South End | 0.2740993 | 6.543536 |
| Brighton | 0.2806103 | 6.448109 |
| Mattapan | 0.2879195 | 6.340984 |
| Jamaica Plain | 0.3073300 | 6.056499 |
| Dorchester | 0.3296159 | 5.729871 |
| East Boston | 0.3370736 | 5.620570 |
| Bay Village | 0.3621198 | 5.253486 |
| Mission Hill | 0.3928508 | 4.803085 |
| Roxbury | 0.4275947 | 4.293870 |
| Allston | 0.4346681 | 4.190201 |
| Harbor Islands | 0.6523364 | 1.000000 |
#Communication
lep_nb <- neighb[,c(1,9)]
missing <- data.frame(matrix(c("Beacon Hill", "Downtown", "Chinatown",NA,NA,NA), nrow = 3, ncol = 2, byrow = F))
colnames(missing) <- colnames(lep_nb)
lep_nb <- rbind(lep_nb,missing)
lep_nb[24,2] <- lep_nb[12,2]
lep_nb[25,2] <- lep_nb[12,2]
lep_nb[26,2] <- lep_nb[12,2]
lep_nb$lep_proportion <- as.numeric(lep_nb$lep_proportion)
lep_nb$score <- rescale(-(lep_nb$lep_proportion),to=c(1,10))
kable(arrange(lep_nb,desc(score)))
| neighborhood | lep_proportion | score |
|---|---|---|
| Longwood Medical Area | 0.0405266 | 10.000000 |
| South Boston Waterfront | 0.1722901 | 8.130247 |
| Fenway | 0.2644631 | 6.822293 |
| Hyde Park | 0.2955683 | 6.380903 |
| North End | 0.3126185 | 6.138957 |
| West Roxbury | 0.3160509 | 6.090251 |
| Back Bay | 0.3175061 | 6.069601 |
| West End | 0.3186925 | 6.052765 |
| Roslindale | 0.3194857 | 6.041510 |
| Leather District | 0.3498814 | 5.610188 |
| Beacon Hill | 0.3498814 | 5.610188 |
| Downtown | 0.3498814 | 5.610188 |
| Chinatown | 0.3498814 | 5.610188 |
| South Boston | 0.3573662 | 5.503977 |
| South End | 0.3605769 | 5.458417 |
| Charlestown | 0.3630391 | 5.423478 |
| Brighton | 0.3767449 | 5.228990 |
| Mattapan | 0.4109597 | 4.743474 |
| Jamaica Plain | 0.4110199 | 4.742620 |
| Dorchester | 0.4230204 | 4.572330 |
| East Boston | 0.4408496 | 4.319330 |
| Allston | 0.4536354 | 4.137896 |
| Mission Hill | 0.4604004 | 4.041899 |
| Roxbury | 0.5133784 | 3.290130 |
| Bay Village | 0.5198157 | 3.198784 |
| Harbor Islands | 0.6747664 | 1.000000 |
#Health
health_nb <- neighb[,c(1,10,11)]
missing <- data.frame(matrix(c("Beacon Hill", "Downtown", "Chinatown",NA,NA,NA,NA,NA,NA), nrow = 3, ncol = 3, byrow = F))
colnames(missing) <- colnames(health_nb)
health_nb <- rbind(health_nb,missing)
health_nb[24,2:3] <- health_nb[12,2:3]
health_nb[25,2:3] <- health_nb[12,2:3]
health_nb[26,2:3] <- health_nb[12,2:3]
health_nb$dis_proportion <- as.numeric(health_nb$dis_proportion)
health_nb$medill_proportion <- as.numeric(health_nb$medill_proportion)
kable(health_nb%>%
select(neighborhood,dis_proportion)%>%
arrange(desc(dis_proportion)))
| neighborhood | dis_proportion |
|---|---|
| Harbor Islands | 0.3345794 |
| Mattapan | 0.1693200 |
| Roxbury | 0.1620701 |
| Dorchester | 0.1394950 |
| Hyde Park | 0.1276455 |
| East Boston | 0.1251877 |
| Roslindale | 0.1194955 |
| South End | 0.1140458 |
| West Roxbury | 0.1124085 |
| Jamaica Plain | 0.1081631 |
| Bay Village | 0.1034101 |
| Mission Hill | 0.0981130 |
| South Boston | 0.0971133 |
| Charlestown | 0.0933755 |
| Brighton | 0.0904622 |
| North End | 0.0792654 |
| Leather District | 0.0774314 |
| Beacon Hill | 0.0774314 |
| Downtown | 0.0774314 |
| Chinatown | 0.0774314 |
| West End | 0.0688749 |
| Back Bay | 0.0660549 |
| South Boston Waterfront | 0.0581907 |
| Allston | 0.0526870 |
| Fenway | 0.0456924 |
| Longwood Medical Area | 0.0425838 |
kable(health_nb%>%
select(neighborhood,medill_proportion)%>%
arrange(desc(medill_proportion)))
| neighborhood | medill_proportion |
|---|---|
| Harbor Islands | 0.4308972 |
| North End | 0.4275983 |
| Back Bay | 0.4134125 |
| South Boston Waterfront | 0.4099315 |
| West End | 0.4097213 |
| Leather District | 0.4062335 |
| Beacon Hill | 0.4062335 |
| Downtown | 0.4062335 |
| Chinatown | 0.4062335 |
| South Boston | 0.4044573 |
| South End | 0.3965920 |
| Bay Village | 0.3956618 |
| West Roxbury | 0.3955603 |
| Roslindale | 0.3940977 |
| Brighton | 0.3935614 |
| Charlestown | 0.3930574 |
| Jamaica Plain | 0.3929036 |
| Hyde Park | 0.3866421 |
| Longwood Medical Area | 0.3752479 |
| Mission Hill | 0.3750855 |
| Mattapan | 0.3744939 |
| Allston | 0.3730045 |
| Fenway | 0.3714622 |
| East Boston | 0.3679612 |
| Roxbury | 0.3628631 |
| Dorchester | 0.3600302 |
#PCA
pr.out <- prcomp(health_nb[,2:3],scale = T)
pr.var <- pr.out$sdev^2
pve <- pr.var/sum(pr.var)
plot(pve, xlab = "Principal Component", ylab = "Proportion of Variance Explained", ylim = c(0,1), type = 'b')
Here we encounter a problem that PC1 seems not strong enough to explain disability proportion and medical illness proportion.
How to deal with problem in PCA? When we did PCA at census tract level, this situation didn’t exist. But after aggregating census tracts into neighborhoods, it’s untenable to use only pc1 to replace two variables in Health and Wellness.
Do the new results kind of make sense compared to the previous results? If they do, we can move on to solve PCA problem, if they don’t, what is the problem? To me, only imputation and rescaling can cause some issues to the results, so if they trouble, how can we improve the process of imputation and rescaling.