Main PCA - All Variables
# County-level "clean" dataset (547 obs per the original note).
# No COVID fields are kept here: every covid* / *_covid_* column is removed
# together with the other columns listed below, and any row that still
# contains an NA is then discarded.
evict.clean <- covid.eviction %>%
  dplyr::select(-c(
    GEOID, year, County, stateFIPS, `low-flag`,
    imputed, subbed,
    evictions_greater_state, evictions_greater_US,
    covidCases_greater_US, covidDeaths_greater_US,
    US_mean_filing_rate, US_covid_cases_rate,
    covidDeaths_greater_state, US_covid_death_rate,
    covidCases_greater_state,
    county_covid_cases_rate, county_covid_death_rate,
    state_covid_cases_rate, state_covid_death_rate,
    covidDeaths, covidCases
  )) %>%
  na.omit()
# County-level dataset that keeps COVID information (revisited later).
# Compared with evict.clean, the covid*_greater_* and evictions_greater_state
# columns are retained; the COVID rate and count columns are still removed.
# Rows containing any NA are discarded after the column selection.
evict.COVID.clean <- covid.eviction %>%
  dplyr::select(-c(
    GEOID, year, County, stateFIPS, `low-flag`,
    imputed, subbed,
    evictions_greater_US,
    US_mean_filing_rate,
    US_covid_cases_rate, US_covid_death_rate,
    county_covid_cases_rate, county_covid_death_rate,
    state_covid_cases_rate, state_covid_death_rate,
    covidDeaths, covidCases
  )) %>%
  na.omit()
# Collapse the EVICT dataset to the STATE level: one row per state holding the
# mean of every remaining column.
# Group on the State column itself (data masking) instead of the external
# vector `evict.clean$State`. Grouping on the external vector created an extra
# key column, so the real State column was fed to mean() (NA plus a warning
# for every group) and then had to be dropped again afterwards.
evict.bystate <- evict.clean %>%
  dplyr::group_by(State) %>%
  dplyr::summarise_all(mean)
# Move the state label (column 1) into the row names so that only the averaged
# variables remain as columns, which is the layout prcomp() and biplot() use.
evict.bystate2 <- data.frame(evict.bystate, row.names = 1)
# Collapse the EVICT COVID dataset to the STATE level (column means per state).
# NOTE(review): the grouping key is the external vector
# `evict.COVID.clean$State`, so the result carries a key column with that name
# and the original State column is also passed to mean() (presumably giving NA
# with a warning per group). Behaviour is left unchanged here because code
# outside this view may rely on the resulting column layout; grouping on
# `State` directly would be the cleaner form. TODO confirm with downstream use.
evict.covid.bystate <- summarise_all(
  group_by(evict.COVID.clean, evict.COVID.clean$State),
  mean
)
#evict.covid.bystate2 <- data.frame(evict.covid.bystate, row.names = 1)
# PCA on the STATE-level means, with every variable standardised first.
# The argument is spelled `scale.` in full: `scale = TRUE` only worked through
# partial argument matching against prcomp()'s `scale.` parameter.
# NOTE(review): the last two components in the printed output have ~0 variance,
# which suggests redundant columns (e.g. state_mean_filing_rate vs
# eviction.filing.rate load almost exclusively on PC20).
pca.state <- prcomp(evict.bystate2, scale. = TRUE)
pca.state
## Standard deviations (1, .., p=21):
## [1] 2.802205e+00 1.937767e+00 1.594893e+00 1.441739e+00 1.197255e+00
## [6] 9.357001e-01 8.766515e-01 7.065048e-01 6.615112e-01 4.972058e-01
## [11] 4.479475e-01 3.501365e-01 2.502060e-01 2.145646e-01 2.022351e-01
## [16] 1.382946e-01 8.944997e-02 8.396634e-02 4.480825e-02 1.192441e-03
## [21] 7.183349e-05
##
## Rotation (n x k) = (21 x 21):
## PC1 PC2 PC3
## poverty.rate -0.03627044 0.193911629 0.47162638
## renter.occupied.households 0.31286351 0.008918725 -0.12107947
## pct.renter.occupied 0.29229955 -0.028127575 0.07232266
## median.gross.rent 0.33192045 -0.119537264 -0.10544043
## median.household.income 0.26090418 -0.139238737 -0.33258077
## median.property.value 0.31192854 -0.181698085 -0.09774482
## rent.burden 0.16541609 0.041182675 0.17601757
## pct.white -0.23176348 0.012112970 -0.38464628
## pct.af.am 0.15977122 0.331799290 0.26734263
## pct.hispanic 0.09228317 -0.122870730 0.05268475
## pct.am.ind -0.04830155 -0.040877160 0.08598101
## pct.asian 0.19625978 -0.355169983 0.22670207
## pct.nh.pi 0.09625013 -0.359363285 0.33648418
## pct.multiple 0.10396610 -0.353473493 0.32150176
## pct.other 0.24157757 -0.017610931 -0.20189492
## eviction.filings 0.22523913 0.243352390 -0.02980383
## evictions 0.28234628 0.170876118 -0.05841257
## eviction.rate 0.06389254 0.287988844 0.16955842
## eviction.filing.rate 0.19480574 0.326326207 0.04795626
## population 0.30849964 -0.019173081 -0.13984234
## state_mean_filing_rate 0.19432213 0.327690073 0.04662827
## PC4 PC5 PC6
## poverty.rate 0.24679181 0.196039595 -0.07495693
## renter.occupied.households 0.20479251 0.119056441 -0.12662985
## pct.renter.occupied 0.22046473 0.033979035 -0.31284598
## median.gross.rent -0.07153794 0.065866164 0.05951807
## median.household.income -0.11467315 -0.068595829 -0.04271553
## median.property.value -0.05235355 0.063276665 -0.09016968
## rent.burden -0.20833768 0.467729280 0.40619474
## pct.white -0.22938260 0.075017966 -0.16920103
## pct.af.am -0.02606450 0.165135809 -0.16167225
## pct.hispanic 0.44293336 -0.116178752 0.63821132
## pct.am.ind 0.46594826 -0.370657602 -0.16277508
## pct.asian -0.17017806 -0.043961271 0.01709534
## pct.nh.pi -0.21741434 -0.135622166 -0.09213628
## pct.multiple -0.20158500 -0.168227034 -0.13373447
## pct.other 0.05629288 0.200233425 0.10182440
## eviction.filings -0.10327628 -0.414998040 0.05308637
## evictions 0.17060301 0.005057141 -0.31495078
## eviction.rate -0.15198398 0.212383749 -0.12834565
## eviction.filing.rate -0.22946428 -0.324120693 0.16600787
## population 0.17229728 0.122815848 0.05133285
## state_mean_filing_rate -0.22868512 -0.323149399 0.16641140
## PC7 PC8 PC9
## poverty.rate -0.200018136 0.231812110 -0.017066154
## renter.occupied.households -0.137598284 -0.139524364 -0.257172040
## pct.renter.occupied -0.204275928 -0.047740187 0.080942609
## median.gross.rent 0.049561143 0.021371413 0.013072389
## median.household.income 0.081364991 -0.055090627 0.187953577
## median.property.value -0.020713300 0.052158050 0.007898211
## rent.burden -0.037272846 0.245508801 -0.458141889
## pct.white -0.027467451 0.105467169 -0.263375544
## pct.af.am -0.181671930 -0.027588810 0.355055174
## pct.hispanic 0.093567757 -0.340214621 0.150333530
## pct.am.ind 0.381376994 0.451762554 -0.308200640
## pct.asian 0.027989771 0.003046890 -0.047977312
## pct.nh.pi 0.022189844 -0.011269794 0.037887817
## pct.multiple 0.093588972 0.009391279 -0.054069974
## pct.other 0.227586050 0.612647730 0.469673453
## eviction.filings -0.312756343 0.069574836 -0.123668619
## evictions 0.110846205 -0.203649841 -0.098306102
## eviction.rate 0.721488602 -0.300498784 -0.029135833
## eviction.filing.rate 0.063320523 0.073667150 -0.027328039
## population -0.003164021 -0.057230234 -0.337524680
## state_mean_filing_rate 0.063044463 0.073709232 -0.028573556
## PC10 PC11 PC12 PC13
## poverty.rate 0.15853173 0.14037964 0.15332247 -0.48085524
## renter.occupied.households 0.02164894 0.21849746 0.17192459 0.03098548
## pct.renter.occupied 0.27633341 -0.44554579 0.32200672 0.46884435
## median.gross.rent -0.18525823 -0.20818268 -0.04307170 -0.17942237
## median.household.income -0.34612465 0.03154570 -0.10506453 -0.05466424
## median.property.value -0.05507582 -0.42640783 0.19148886 -0.54990399
## rent.burden -0.05918501 -0.26852455 -0.27963631 0.20515731
## pct.white 0.27989054 -0.02522701 0.10441093 -0.10140951
## pct.af.am -0.47035790 0.04727821 -0.11709409 0.11869945
## pct.hispanic 0.18717803 -0.10106537 -0.02922714 -0.06032156
## pct.am.ind -0.31529398 -0.15054181 -0.05037368 0.04078990
## pct.asian -0.06668359 0.26944206 0.18696575 0.03828586
## pct.nh.pi 0.13928162 0.11201297 -0.14096307 -0.19197445
## pct.multiple 0.15519241 0.03544589 -0.10532205 0.24963147
## pct.other 0.34387880 0.21469902 -0.03815981 0.09151555
## eviction.filings 0.20226084 -0.03262977 -0.18412192 -0.11441434
## evictions 0.26217061 0.08061384 -0.65710077 -0.08358866
## eviction.rate 0.11973604 -0.07163397 0.18977344 -0.06833581
## eviction.filing.rate 0.02899472 0.02235171 0.18446975 0.03411882
## population -0.10773270 0.50388325 0.22391232 0.05123312
## state_mean_filing_rate 0.02736023 0.02092365 0.18505536 0.03339604
## PC14 PC15 PC16
## poverty.rate -0.47555871 -0.098574188 0.03094468
## renter.occupied.households 0.06744942 0.111251190 -0.28185317
## pct.renter.occupied -0.06433967 -0.253828237 0.07619448
## median.gross.rent -0.09966354 0.114545953 0.65592000
## median.household.income -0.58629338 -0.316080874 -0.33410411
## median.property.value 0.22356521 0.246890929 -0.19341521
## rent.burden -0.03349325 -0.127143109 -0.14132710
## pct.white -0.10096608 -0.078763460 0.08113354
## pct.af.am 0.18903885 0.128705083 -0.04321656
## pct.hispanic -0.05352332 -0.004397341 -0.04382986
## pct.am.ind 0.06492537 -0.066365845 -0.04569019
## pct.asian 0.09121247 -0.408470668 -0.02149976
## pct.nh.pi 0.32040271 -0.292641459 0.04727099
## pct.multiple -0.38325528 0.624212077 -0.07551472
## pct.other 0.11248095 0.066724695 -0.07163111
## eviction.filings 0.13200676 0.056430937 -0.36187518
## evictions -0.02674420 -0.124690422 0.19763150
## eviction.rate 0.03233596 -0.009639362 -0.20467673
## eviction.filing.rate -0.07336191 -0.045944882 0.16249165
## population 0.08190346 0.163541651 0.15844810
## state_mean_filing_rate -0.07468657 -0.043296702 0.15859766
## PC17 PC18 PC19
## poverty.rate -0.056385346 0.008407050 -0.003290466
## renter.occupied.households 0.092075415 -0.676879477 -0.262583432
## pct.renter.occupied -0.075308187 0.171386559 -0.050537625
## median.gross.rent -0.435353744 -0.303006312 0.044698543
## median.household.income -0.056190646 0.120530756 -0.171719917
## median.property.value 0.347393975 0.198078474 0.099281562
## rent.burden 0.053364327 0.035827197 -0.062811922
## pct.white -0.023835239 0.001537587 -0.045784353
## pct.af.am 0.001420509 0.033336976 -0.025211347
## pct.hispanic 0.017446844 0.010306572 -0.033049138
## pct.am.ind -0.031458591 -0.033626145 -0.029010364
## pct.asian 0.094292951 -0.153546438 0.634957401
## pct.nh.pi -0.114704094 0.071251006 -0.608637253
## pct.multiple 0.050688792 0.016254273 0.019243796
## pct.other -0.022904919 -0.047371487 0.002082552
## eviction.filings -0.542706610 0.053282871 0.218174133
## evictions 0.313749599 0.050217546 0.160119654
## eviction.rate -0.287892373 0.001799867 0.042295150
## eviction.filing.rate 0.272378239 -0.034537049 -0.113749226
## population -0.120409287 0.569543779 -0.067722984
## state_mean_filing_rate 0.273687679 -0.032891014 -0.109518261
## PC20 PC21
## poverty.rate -0.0001739192 -5.490040e-05
## renter.occupied.households 0.0003336271 7.516048e-05
## pct.renter.occupied 0.0007947018 -3.878454e-05
## median.gross.rent 0.0017479039 2.014422e-04
## median.household.income -0.0005861530 -1.724724e-04
## median.property.value -0.0022863062 -5.730169e-05
## rent.burden -0.0007625800 -3.348582e-05
## pct.white 0.0062923171 -7.100854e-01
## pct.af.am 0.0042469140 -5.213305e-01
## pct.hispanic 0.0034518497 -3.917219e-01
## pct.am.ind 0.0010108039 -1.521266e-01
## pct.asian 0.0012088337 -1.578753e-01
## pct.nh.pi 0.0046025072 -6.533948e-02
## pct.multiple 0.0006203815 -1.345733e-01
## pct.other 0.0007253216 -4.895284e-03
## eviction.filings -0.0010028829 -1.984173e-05
## evictions 0.0003482971 6.256770e-05
## eviction.rate -0.0004230036 1.897596e-05
## eviction.filing.rate -0.7064151084 -6.201809e-03
## population -0.0003202347 -1.704483e-04
## state_mean_filing_rate 0.7077232395 6.195809e-03
# Variance captured by each principal component (squared standard deviations)
pca.state[["sdev"]]^2
## [1] 7.852353e+00 3.754941e+00 2.543684e+00 2.078612e+00 1.433420e+00
## [6] 8.755346e-01 7.685178e-01 4.991490e-01 4.375971e-01 2.472136e-01
## [11] 2.006570e-01 1.225955e-01 6.260307e-02 4.603799e-02 4.089904e-02
## [16] 1.912540e-02 8.001296e-03 7.050347e-03 2.007779e-03 1.421915e-06
## [21] 5.160050e-09
# Share of the total variance explained by each principal component
prop.table(pca.state$sdev^2)
## [1] 3.739216e-01 1.788067e-01 1.211278e-01 9.898151e-02 6.825811e-02
## [6] 4.169213e-02 3.659609e-02 2.376900e-02 2.083796e-02 1.177208e-02
## [11] 9.555094e-03 5.837883e-03 2.981098e-03 2.192285e-03 1.947573e-03
## [16] 9.107335e-04 3.810141e-04 3.357308e-04 9.560854e-05 6.771022e-08
## [21] 2.457167e-10
# PC1 and PC2 carry the largest shares of variance, so plot those two as a biplot.
# arrow.len = 0 suppresses the arrow heads on the variable axes.
biplot(pca.state, choices = 1:2, scale = 0, arrow.len = 0)
### Variation: Main PCA

Our first PCA is a function of all 21 variables from the housing and evictions dataset. The first principal component, PC1, has a variance of 7.85, or approximately 37.39% of the model’s total variation. The second principal component, PC2, has a variance of 3.75, and accounts for approximately 17.88% of the model’s total variation. Because there are 21 variables (p = 21), some principal components contribute almost no variance to the model. Next I graph a biplot of PC1 and PC2. Clearly, projecting 21 dimensions into 2 does not make for a quality graphic! From what we are able to parse from this plot, most states appear to behave quite similarly. However, we can see some clear outliers. DC appears to be leading on both the “evictions” and “eviction filing” dimensions, closely followed by Maryland (MD). However, DC is also quite high on the “median household income” dimension, as well as the “rent burden” dimension. This may be due to the fact that while DC is only a city, it is treated equivalently to all other states in this analysis. Other states with major cities, such as New York (NYC) or Illinois (Chicago), also have rural and suburban areas that are included in state averages, whereas DC is almost entirely urban.
In order to create a slightly more interpretable graphic, I create two variable subsets of the data: “Housing & Evictions” and “Population Statistics: Race, Ethnicity, Poverty”. By performing PCA on these two subsets, I will be able to gain more insight into which variables in these subcategories create the most variance, and thus are most impactful.
Subsetted PCAs: Housing & Eviction and Population Statistics
# "Housing & Evictions" subset of the state-level means.
# Rates are used instead of raw counts, and state_mean_filing_rate is left out
# because, per the original note, it is identical to eviction.filing.rate once
# the data are collapsed to the state level.
housing.sub <- evict.bystate2 %>%
  dplyr::select(
    pct.renter.occupied, median.gross.rent, median.property.value,
    eviction.rate, eviction.filing.rate
  )
# PCA on the standardised HOUSING & EVICTIONS subset.
# `scale.` is written in full; `scale = TRUE` relied on partial argument
# matching against prcomp()'s `scale.` parameter.
pca.housing <- prcomp(housing.sub, scale. = TRUE)
# Share of total variance explained by each of the 5 "Housing" PCs
# (one component per variable in housing.sub)
prop.table(pca.housing$sdev^2)
## [1] 0.550186658 0.274089555 0.099863068 0.068878306 0.006982412
# Biplot of our 2 primary housing PCs (PC1 and PC2); arrow heads suppressed
biplot(pca.housing, choices = 1:2, scale = 0, arrow.len = 0)
# Print the loadings to see which variables drive the main housing PCs
print(pca.housing)
## Standard deviations (1, .., p=5):
## [1] 1.6585938 1.1706613 0.7066225 0.5868488 0.1868477
##
## Rotation (n x k) = (5 x 5):
## PC1 PC2 PC3 PC4
## pct.renter.occupied 0.5126549 0.1246095 -0.24647753 -0.8085559
## median.gross.rent 0.5732881 0.0953457 0.03335204 0.4395208
## median.property.value 0.5645345 0.2138619 -0.07706653 0.3394145
## eviction.rate 0.0863060 -0.7476249 -0.64440333 0.1335842
## eviction.filing.rate 0.2870154 -0.6088528 0.71898649 -0.1414633
## PC5
## pct.renter.occupied -0.08451955
## median.gross.rent -0.68407535
## median.property.value 0.71723252
## eviction.rate 0.02253587
## eviction.filing.rate 0.09983491
Variation & Analysis: Housing and Evictions PCA
After running PCA on the housing and evictions subset, I examine the two most impactful principal components. PC1 accounts for 55.02% of the model’s variance, and is mostly a function of “percent renter occupied,” “median gross rent,” and “median property value.” PC2 describes 27.41% of the model’s variance, and is mainly a function of “eviction rate” and “eviction filing rate.” I then graph a biplot of PC1 and PC2.
From this biplot, we can see two clear clusters of variables. The direction of the “eviction” variables (eviction rate and eviction filing rate) is nearly perpendicular to that of the rest of the “housing” variables (percent renter occupied, median gross rent, etc.). This is somewhat surprising to me, as I would have assumed that variables such as median property value and median rent would possibly be correlated with eviction rates. For robustness, I re-run the “housing” PCA to include “median household income,” a variable otherwise included in the “population statistics” subset. “Median household income” clusters with the rest of the rent / property value dimensions, running perpendicular to the “eviction” variables.
While this plot is still somewhat difficult to read, we again see DC as an outlier. Both DC and Hawaii (HI) are higher on the “housing” axis: they both have higher median property value and higher median gross rent. They also appear to have higher percentages of renters.
Delaware and Maryland have comparable median costs of renting. However, they have notably higher eviction filing rates and eviction rates, with DC and Virginia close behind. South Carolina is the furthest along the “eviction” axis.
Next, I conduct PCA with the “population statistics” subset
# "Population Statistics" subset of the state-level means: poverty, income,
# the race / ethnicity percentage columns and population.
pop.sub <- evict.bystate2 %>%
  dplyr::select(
    poverty.rate, median.household.income,
    pct.white, pct.af.am, pct.hispanic, pct.am.ind, pct.asian,
    pct.nh.pi, pct.multiple, pct.other,
    population
  )
# PCA on the standardised POPULATION subset.
# `scale.` is written in full; `scale = TRUE` relied on partial argument
# matching against prcomp()'s `scale.` parameter.
# NOTE(review): the last component has ~0 variance in the printed output and
# loads only on the pct.* columns, which suggests they are (nearly) collinear.
pca.pop <- prcomp(pop.sub, scale. = TRUE)
# Share of total variance explained by each "Population" PC
prop.table(pca.pop$sdev^2)
## [1] 3.320943e-01 2.080871e-01 1.978896e-01 1.336183e-01 5.615873e-02
## [6] 3.602707e-02 2.501731e-02 5.206918e-03 4.497380e-03 1.403138e-03
## [11] 6.937790e-10
# Biplot of our 2 primary population PCs (PC1 and PC2); arrow heads suppressed
biplot(pca.pop, choices = 1:2, scale = 0, arrow.len = 0)
# Unlike the "housing" PCA, the variables here do not fall into clearly
# separated directional clusters, so print the loadings of the Population PCA
# to see which variables drive the main PCs.
print(pca.pop)
## Standard deviations (1, .., p=11):
## [1] 1.911292e+00 1.512930e+00 1.475393e+00 1.212354e+00 7.859682e-01
## [6] 6.295219e-01 5.245859e-01 2.393242e-01 2.224212e-01 1.242357e-01
## [11] 8.735885e-05
##
## Rotation (n x k) = (11 x 11):
## PC1 PC2 PC3 PC4
## poverty.rate -0.08885636 0.47737504 -0.38808552 0.12567648
## median.household.income 0.33271606 -0.46952252 0.02750817 0.03679128
## pct.white -0.35543838 -0.26362114 0.39138443 0.04333396
## pct.af.am 0.04534129 0.11882276 -0.49214688 0.50043697
## pct.hispanic 0.19598569 0.04292771 -0.24072931 -0.57297663
## pct.am.ind -0.02026225 0.20176573 -0.11451159 -0.61716663
## pct.asian 0.48081273 0.12978169 0.19100438 0.07883918
## pct.nh.pi 0.38797602 0.30506801 0.30744289 0.09604678
## pct.multiple 0.38989303 0.29111535 0.30716458 0.05805093
## pct.other 0.26395119 -0.37444572 -0.24405760 0.02828610
## population 0.33286373 -0.30104319 -0.31186713 -0.04158548
## PC5 PC6 PC7 PC8
## poverty.rate 0.076796738 0.31607039 0.35648322 0.535112088
## median.household.income 0.109691856 -0.30832943 -0.22693472 0.634244942
## pct.white 0.107559928 0.17475081 0.28967314 0.108312020
## pct.af.am 0.083914158 -0.28506248 -0.30940475 -0.161073389
## pct.hispanic -0.620974695 0.09290939 -0.13199346 0.037379752
## pct.am.ind 0.696541009 -0.18940078 -0.09380170 0.009802344
## pct.asian -0.006170493 0.02467350 0.18637972 0.208000495
## pct.nh.pi 0.020876785 0.08835586 -0.04538770 0.102988154
## pct.multiple 0.114534829 0.02619172 -0.02901405 -0.357391233
## pct.other 0.276022924 0.76719907 -0.21089891 -0.137402411
## population 0.053846899 -0.22590536 0.73036969 -0.267843728
## PC9 PC10 PC11
## poverty.rate -0.279229617 0.024900248 6.843108e-05
## median.household.income -0.316694459 0.074683206 8.333609e-05
## pct.white -0.051132411 0.066866386 7.101220e-01
## pct.af.am 0.096071687 0.014192033 5.213159e-01
## pct.hispanic -0.070723464 0.044466178 3.917317e-01
## pct.am.ind 0.108126511 -0.009972579 1.521342e-01
## pct.asian 0.444766286 -0.638623223 1.580266e-01
## pct.nh.pi 0.323682822 0.722995528 6.526634e-02
## pct.multiple -0.699237108 -0.126910271 1.345437e-01
## pct.other 0.039796997 0.002725753 4.886822e-03
## population -0.009697701 0.200941221 1.226055e-05
# Housing & evictions subset with pct.af.am added, to see how percent African
# American relates to the housing and eviction variables.
AA.housing.sub <- evict.bystate2 %>%
  dplyr::select(
    pct.renter.occupied, median.gross.rent, median.property.value,
    eviction.rate, eviction.filing.rate, pct.af.am
  )
# `scale.` is written in full; `scale = TRUE` relied on partial argument
# matching against prcomp()'s `scale.` parameter.
pca.AA.housing <- prcomp(AA.housing.sub, scale. = TRUE)
# Biplot of PC1 vs PC2 with the arrow heads suppressed.
biplot(pca.AA.housing, scale = 0, arrow.len = 0)
# Interestingly, pct.af.am appears to point in the same direction as the
# eviction rate variables.
Variation & Analysis: Population Statistics PCA
In the “Population Statistics” PCA, PC1 accounts for 33.21% of the model’s variance, and is mostly a function of percent Asian, percent multi-racial, percent Pacific Islander, and population size. PC2 describes 20.81% of the model’s variance, and is mainly a function of percent “other” race, median household income, and poverty rate.
The biplot of PC1 and PC2 shows Hawaii (HI) to be a clear outlier, with its large Asian, Pacific Islander, and mixed-race population. Both DC and California (CA) also appear to be more racially diverse, while relatively low on the “poverty” dimension, which is led by Mississippi (MS) and Arizona (AZ).
Notably, I was surprised to find that percent African American was not more important in each PC, as I know that in the social science literature race is traditionally found to be closely related to a variety of life outcome variables. To see whether the negligible effect of percent African American is due to its inclusion in the “Population Statistics” PCA, I add it to the “Housing” PCA and examine the results. Interestingly, percent African American does appear correlated with eviction rates and eviction filing rates. However, principal components for which percent African American plays a large role contribute minimal variance to the overall model.