The focus of the research will be on how robbery in different communities in the United States could be explained by some of the societal factors such as ethnicity, income, age, population etc. and using an unsupervised learning algorithm to rank or group communities (into high, medium or low) by their susceptibility to robberies.
library(readr)
crimedata <- read_csv("C:/Users/shreejit/Downloads/crimedata.csv")
cd <- crimedata
cd[cd=="?"] <- NA
# Predictor Variables and the dependant Variable
myvars2 <- c (
"robbbPerPop",
"racePctWhite",
"racepctblack",
"racePctAsian",
"racePctHisp",
"agePct12t21",
"agePct12t29",
"agePct16t24",
"agePct65up",
"medIncome",
"pctWWage",
"pctWFarmSelf",
"pctWInvInc",
"pctWSocSec",
"pctWPubAsst",
"pctWRetire",
"medFamInc",
"perCapInc",
"population",
"householdsize",
"MedRent",
"MedRentPctHousInc",
"MedOwnCostPctInc",
"MedOwnCostPctIncNoMtg",
"NumInShelters",
"NumStreet"
)
cd2 <- cd[myvars2]
#converting dependent variable from Character to numeric
cd2$robbbPerPop <- as.numeric(as.character(cd2$robbbPerPop))
cd2 <- na.omit(cd2)
str(cd2)
## Classes 'tbl_df', 'tbl' and 'data.frame': 2214 obs. of 26 variables:
## $ robbbPerPop : num 8.2 21.3 154.9 57.9 32 ...
## $ racePctWhite : num 91.8 95.6 94.3 97.3 89.2 ...
## $ racepctblack : num 1.37 0.8 0.74 1.7 0.53 ...
## $ racePctAsian : num 6.5 3.44 3.43 0.5 1.17 0.9 1.47 0.4 1.25 0.92 ...
## $ racePctHisp : num 1.88 0.85 2.35 0.7 0.52 ...
## $ agePct12t21 : num 12.5 11 11.4 12.6 24.5 ...
## $ agePct12t29 : num 21.4 21.3 25.9 25.2 40.5 ...
## $ agePct16t24 : num 10.9 10.5 11 12.2 28.7 ...
## $ agePct65up : num 11.3 17.2 10.3 17.6 12.6 ...
## $ medIncome : int 75122 47917 35669 20580 17390 21577 42805 23221 25326 17852 ...
## $ pctWWage : num 89.2 79 82 68.2 69.3 ...
## $ pctWFarmSelf : num 1.55 1.11 1.15 0.24 0.55 1 0.39 0.67 2.93 0.86 ...
## $ pctWInvInc : num 70.2 64.1 55.7 39 42.8 ...
## $ pctWSocSec : num 23.6 35.5 22.2 39.5 32.2 ...
## $ pctWPubAsst : num 1.03 2.75 2.94 11.71 11.21 ...
## $ pctWRetire : num 18.4 22.9 14.6 18.3 14.4 ...
## $ medFamInc : int 79584 55323 42112 26501 24018 27705 50394 28901 34269 24058 ...
## $ perCapInc : int 29711 20148 16946 10810 8483 11878 18193 12161 13554 10195 ...
## $ population : int 11980 23123 29344 16656 11245 140494 28700 59459 74111 103590 ...
## $ householdsize : num 3.1 2.82 2.43 2.4 2.76 2.45 2.6 2.45 2.46 2.62 ...
## $ MedRent : int 1001 627 484 333 332 340 736 338 355 353 ...
## $ MedRentPctHousInc : num 23.8 27.6 24.1 28.7 32.2 26.4 24.4 26.3 25.2 29.6 ...
## $ MedOwnCostPctInc : num 21.1 20.7 21.7 20.6 23.2 17.3 20.8 15.1 20.7 19.4 ...
## $ MedOwnCostPctIncNoMtg: num 14 12.5 11.6 14.5 12.9 11.7 12.5 12.2 12.8 13 ...
## $ NumInShelters : int 11 0 16 0 2 327 0 21 125 43 ...
## $ NumStreet : int 0 0 0 0 0 4 0 0 15 4 ...
## - attr(*, "na.action")=Class 'omit' Named int 819
## .. ..- attr(*, "names")= chr "819"
# All predictor variables for PCA
myvars <- c (
"racePctWhite",
"racepctblack",
"racePctAsian",
"racePctHisp",
"agePct12t21",
"agePct12t29",
"agePct16t24",
"agePct65up",
"medIncome",
"pctWWage",
"pctWFarmSelf",
"pctWInvInc",
"pctWSocSec",
"pctWPubAsst",
"pctWRetire",
"medFamInc",
"perCapInc",
"population",
"householdsize",
"MedRent",
"MedRentPctHousInc",
"MedOwnCostPctInc",
"MedOwnCostPctIncNoMtg",
"NumInShelters",
"NumStreet"
)
newdata <- cd2[myvars]
# Getting rid of all rows with Null values
my_data <- na.omit(newdata)
#check available variables
colnames(my_data)
## [1] "racePctWhite" "racepctblack"
## [3] "racePctAsian" "racePctHisp"
## [5] "agePct12t21" "agePct12t29"
## [7] "agePct16t24" "agePct65up"
## [9] "medIncome" "pctWWage"
## [11] "pctWFarmSelf" "pctWInvInc"
## [13] "pctWSocSec" "pctWPubAsst"
## [15] "pctWRetire" "medFamInc"
## [17] "perCapInc" "population"
## [19] "householdsize" "MedRent"
## [21] "MedRentPctHousInc" "MedOwnCostPctInc"
## [23] "MedOwnCostPctIncNoMtg" "NumInShelters"
## [25] "NumStreet"
#check variable class
str(my_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 2214 obs. of 25 variables:
## $ racePctWhite : num 91.8 95.6 94.3 97.3 89.2 ...
## $ racepctblack : num 1.37 0.8 0.74 1.7 0.53 ...
## $ racePctAsian : num 6.5 3.44 3.43 0.5 1.17 0.9 1.47 0.4 1.25 0.92 ...
## $ racePctHisp : num 1.88 0.85 2.35 0.7 0.52 ...
## $ agePct12t21 : num 12.5 11 11.4 12.6 24.5 ...
## $ agePct12t29 : num 21.4 21.3 25.9 25.2 40.5 ...
## $ agePct16t24 : num 10.9 10.5 11 12.2 28.7 ...
## $ agePct65up : num 11.3 17.2 10.3 17.6 12.6 ...
## $ medIncome : int 75122 47917 35669 20580 17390 21577 42805 23221 25326 17852 ...
## $ pctWWage : num 89.2 79 82 68.2 69.3 ...
## $ pctWFarmSelf : num 1.55 1.11 1.15 0.24 0.55 1 0.39 0.67 2.93 0.86 ...
## $ pctWInvInc : num 70.2 64.1 55.7 39 42.8 ...
## $ pctWSocSec : num 23.6 35.5 22.2 39.5 32.2 ...
## $ pctWPubAsst : num 1.03 2.75 2.94 11.71 11.21 ...
## $ pctWRetire : num 18.4 22.9 14.6 18.3 14.4 ...
## $ medFamInc : int 79584 55323 42112 26501 24018 27705 50394 28901 34269 24058 ...
## $ perCapInc : int 29711 20148 16946 10810 8483 11878 18193 12161 13554 10195 ...
## $ population : int 11980 23123 29344 16656 11245 140494 28700 59459 74111 103590 ...
## $ householdsize : num 3.1 2.82 2.43 2.4 2.76 2.45 2.6 2.45 2.46 2.62 ...
## $ MedRent : int 1001 627 484 333 332 340 736 338 355 353 ...
## $ MedRentPctHousInc : num 23.8 27.6 24.1 28.7 32.2 26.4 24.4 26.3 25.2 29.6 ...
## $ MedOwnCostPctInc : num 21.1 20.7 21.7 20.6 23.2 17.3 20.8 15.1 20.7 19.4 ...
## $ MedOwnCostPctIncNoMtg: num 14 12.5 11.6 14.5 12.9 11.7 12.5 12.2 12.8 13 ...
## $ NumInShelters : int 11 0 16 0 2 327 0 21 125 43 ...
## $ NumStreet : int 0 0 0 0 0 4 0 0 15 4 ...
#principal component analysis
prin_comp <- prcomp(my_data, scale. = T)
names(prin_comp)
## [1] "sdev" "rotation" "center" "scale" "x"
#outputs the mean of variables
prin_comp$center
## racePctWhite racepctblack racePctAsian
## 8.399395e+01 9.326631e+00 2.669548e+00
## racePctHisp agePct12t21 agePct12t29
## 7.941287e+00 1.444586e+01 2.764354e+01
## agePct16t24 agePct65up medIncome
## 1.397480e+01 1.183833e+01 3.398818e+04
## pctWWage pctWFarmSelf pctWInvInc
## 7.831075e+01 8.818609e-01 4.375618e+01
## pctWSocSec pctWPubAsst pctWRetire
## 2.641326e+01 6.801328e+00 1.597203e+01
## medFamInc perCapInc population
## 3.986140e+04 1.560413e+04 5.240550e+04
## householdsize MedRent MedRentPctHousInc
## 2.707358e+00 5.015167e+02 2.629932e+01
## MedOwnCostPctInc MedOwnCostPctIncNoMtg NumInShelters
## 2.099065e+01 1.301003e+01 6.617977e+01
## NumStreet
## 1.776332e+01
#outputs the standard deviation of variables
prin_comp$scale
## racePctWhite racepctblack racePctAsian
## 1.640931e+01 1.424479e+01 4.474748e+00
## racePctHisp agePct12t21 agePct12t29
## 1.458713e+01 4.519644e+00 6.182609e+00
## agePct16t24 agePct65up medIncome
## 5.972074e+00 4.777771e+00 1.342671e+04
## pctWWage pctWFarmSelf pctWInvInc
## 7.951909e+00 6.891607e-01 1.278843e+01
## pctWSocSec pctWPubAsst pctWRetire
## 8.295505e+00 4.701393e+00 4.621396e+00
## medFamInc perCapInc population
## 1.425296e+04 6.282913e+03 2.018996e+05
## householdsize MedRent MedRentPctHousInc
## 3.341921e-01 1.692934e+02 2.979418e+00
## MedOwnCostPctInc MedOwnCostPctIncNoMtg NumInShelters
## 2.988207e+00 1.419975e+00 5.632041e+02
## NumStreet
## 2.454917e+02
prin_comp$rotation
## PC1 PC2 PC3 PC4
## racePctWhite -0.194966074 0.15084528 0.220651048 -0.3277652840
## racepctblack 0.203684291 -0.03254688 -0.148569392 0.1898231662
## racePctAsian -0.113302630 -0.14632940 -0.161280369 0.2102543266
## racePctHisp 0.092927785 -0.17773566 -0.117130935 0.2566476895
## agePct12t21 0.135214178 -0.30494828 0.170766306 -0.1760782192
## agePct12t29 0.134059034 -0.34654532 0.128744816 -0.1476563493
## agePct16t24 0.128323502 -0.28574342 0.152021052 -0.1827543671
## agePct65up 0.100652078 0.38482150 -0.036706014 0.0361981154
## medIncome -0.372544960 -0.02789630 -0.067968569 0.0765699591
## pctWWage -0.253760879 -0.28712908 0.038895535 -0.0594061412
## pctWFarmSelf 0.004626702 -0.08639354 0.138292071 -0.1649668193
## pctWInvInc -0.324603433 0.10537184 0.045906483 -0.1430684741
## pctWSocSec 0.173671652 0.35678348 -0.008288950 0.0321832452
## pctWPubAsst 0.311729165 -0.02726399 -0.106239593 0.2030954297
## pctWRetire 0.041623339 0.32047565 -0.002933007 0.0008269341
## medFamInc -0.369383773 -0.01511754 -0.050868267 0.0400932609
## perCapInc -0.338601858 0.06292932 -0.085978796 0.0497343422
## population 0.044420629 -0.05515756 -0.483489393 -0.2906476826
## householdsize -0.014256156 -0.32058017 0.020669083 0.1163206471
## MedRent -0.318981316 -0.05822502 -0.147224098 0.2214387707
## MedRentPctHousInc 0.152490950 -0.09710396 -0.095530879 0.2365452367
## MedOwnCostPctInc -0.122531436 -0.12067960 -0.195960273 0.3597321132
## MedOwnCostPctIncNoMtg 0.049481648 0.08468490 -0.078366463 0.1558236402
## NumInShelters 0.043148452 -0.04131481 -0.483979529 -0.3116104268
## NumStreet 0.030563239 -0.03956464 -0.476275296 -0.3189393376
## PC5 PC6 PC7 PC8
## racePctWhite 0.068525656 -0.208849041 0.254117304 -0.2414013384
## racepctblack -0.051104496 0.537133929 -0.179114838 0.2787583120
## racePctAsian 0.120465948 -0.166138668 -0.492224447 0.0028516745
## racePctHisp -0.118464087 -0.502687193 0.126864837 -0.0322023390
## agePct12t21 0.336439510 0.023235062 -0.005008257 0.0981375407
## agePct12t29 0.266345884 0.094242980 -0.011497915 -0.0913016769
## agePct16t24 0.423927249 0.077685880 -0.037866170 0.0105478639
## agePct65up 0.250433382 -0.116320768 -0.027857256 0.1231629349
## medIncome 0.053985428 0.038840994 0.004907196 0.1261745180
## pctWWage -0.201491343 0.135505764 0.030624432 -0.1575196043
## pctWFarmSelf -0.122615531 -0.282596124 0.174117441 0.7872394782
## pctWInvInc 0.242775829 -0.000783563 -0.042686987 0.0952742331
## pctWSocSec 0.221405434 -0.122266003 0.047358136 0.1312105189
## pctWPubAsst -0.053857152 -0.071590623 -0.026110079 0.0868618787
## pctWRetire 0.260181655 -0.114440292 -0.140869141 -0.1654128197
## medFamInc 0.131614567 0.070954796 -0.016112279 0.1659986438
## perCapInc 0.139744080 0.088842366 -0.074585848 0.2520430539
## population 0.009998794 -0.021300977 0.015143606 -0.0174166757
## householdsize -0.010137946 -0.228771551 0.154331046 0.0405206803
## MedRent 0.187998413 -0.002097012 0.035946363 -0.0283767676
## MedRentPctHousInc 0.442632324 -0.091883331 0.056203843 -0.0049924651
## MedOwnCostPctInc 0.131655676 -0.113043586 0.286567821 -0.1170064794
## MedOwnCostPctIncNoMtg 0.117660753 0.366992133 0.681911501 0.0203715991
## NumInShelters 0.020569932 -0.020867391 0.043336724 0.0092908815
## NumStreet 0.023661749 -0.060174421 0.058329456 0.0001295475
## PC9 PC10 PC11 PC12
## racePctWhite 0.114671868 -0.020927471 0.066170878 -0.217652198
## racepctblack -0.130870025 -0.194932483 -0.145159488 0.142995801
## racePctAsian 0.254119384 0.642205481 -0.177401447 -0.197688381
## racePctHisp -0.168741020 0.072133863 0.337421747 0.414994475
## agePct12t21 -0.183874419 0.037232194 0.028624099 0.037833105
## agePct12t29 0.085148990 0.084806817 0.031202492 0.201028534
## agePct16t24 0.015256540 0.072666201 0.060245003 0.205419565
## agePct65up 0.042878890 0.062282509 0.022954255 0.268322186
## medIncome -0.217380164 -0.031428945 0.072497596 -0.084525536
## pctWWage 0.004508732 -0.026203359 -0.164923531 -0.007913330
## pctWFarmSelf 0.228715900 -0.010196405 -0.299324462 -0.068428764
## pctWInvInc -0.047117470 0.020614403 0.042043849 0.007002220
## pctWSocSec -0.125705873 0.109229836 0.005664149 0.113317620
## pctWPubAsst -0.118149600 0.043442739 0.326520891 -0.393329754
## pctWRetire -0.376706788 -0.056885797 -0.490197320 -0.030031284
## medFamInc -0.161739407 -0.020893808 0.174037452 -0.029999171
## perCapInc -0.054076935 -0.050292042 0.362946991 0.106233633
## population -0.008323445 -0.022233984 0.005498385 0.001397416
## householdsize -0.603293074 0.008729334 -0.248418130 -0.185068372
## MedRent 0.010802853 -0.054712934 -0.063406735 0.015621509
## MedRentPctHousInc 0.248529062 -0.426679225 0.095980800 -0.479134210
## MedOwnCostPctInc 0.322243890 -0.212426156 -0.337451938 0.305482481
## MedOwnCostPctIncNoMtg -0.019596564 0.522675086 -0.010842237 -0.120180739
## NumInShelters -0.018038223 0.003678950 -0.008589777 -0.013415971
## NumStreet -0.022742570 -0.004881236 -0.038171024 -0.016021068
## PC13 PC14 PC15 PC16
## racePctWhite 0.157844377 -0.09632605 0.095400357 -0.038432884
## racepctblack -0.010854771 0.07652033 -0.061032936 0.045699440
## racePctAsian 0.044314516 0.07186322 -0.027947816 -0.128032659
## racePctHisp -0.415060395 0.06163638 -0.083794492 -0.029733914
## agePct12t21 0.161832455 0.01203091 -0.037622354 -0.251347357
## agePct12t29 -0.087572927 -0.20438408 0.111142279 0.198541607
## agePct16t24 -0.002904052 -0.10291467 0.048849757 0.062685460
## agePct65up 0.150902367 0.23217675 0.107687800 0.090064393
## medIncome 0.054842571 -0.05200378 0.076102080 -0.096076362
## pctWWage -0.236502306 0.10088984 0.004277232 0.103872765
## pctWFarmSelf -0.170983769 -0.12975999 0.065940697 0.051430679
## pctWInvInc -0.088680994 0.12554054 -0.742182827 0.425220623
## pctWSocSec 0.209979613 0.18636609 0.117561565 0.118526584
## pctWPubAsst 0.185217688 -0.55912986 -0.138377387 0.314160309
## pctWRetire -0.429325885 -0.41562538 -0.005500567 -0.144247171
## medFamInc 0.018209427 -0.07903823 0.046597745 -0.162932484
## perCapInc -0.004012980 -0.20677954 0.077455946 -0.332116083
## population -0.064656908 0.01239655 -0.087852637 -0.005311972
## householdsize 0.315621806 0.26991760 -0.012546237 0.036861448
## MedRent -0.057493496 -0.03338195 0.514544782 0.586875744
## MedRentPctHousInc -0.280447295 0.31857828 -0.011213426 -0.131259420
## MedOwnCostPctInc 0.388259048 -0.27975965 -0.256317099 -0.133608991
## MedOwnCostPctIncNoMtg -0.208491175 0.01969186 -0.039184760 -0.079761533
## NumInShelters 0.014941251 -0.01337515 0.011052363 0.040571124
## NumStreet 0.069229506 0.02122940 0.096659849 -0.041782149
## PC17 PC18 PC19 PC20
## racePctWhite 0.0499129819 -0.018357086 0.007630113 -0.089484677
## racepctblack -0.0272255962 -0.047483827 0.034202334 -0.057525591
## racePctAsian 0.0015487341 0.005971051 0.003099429 -0.030778204
## racePctHisp -0.0623762475 -0.121267695 0.008079894 -0.058898997
## agePct12t21 -0.0149038416 -0.630449110 -0.053833023 -0.043780451
## agePct12t29 0.0298779103 0.452760106 0.056117263 0.045164860
## agePct16t24 0.0001540203 0.075984304 0.017856832 -0.093633115
## agePct65up 0.0372698642 0.150827728 0.001458749 -0.165901754
## medIncome 0.0211392021 -0.095254520 0.026616444 -0.252969860
## pctWWage -0.0412375161 0.113690181 0.135173999 -0.649722881
## pctWFarmSelf 0.0252363372 0.011365098 0.006645177 -0.017708325
## pctWInvInc -0.0965364770 -0.030539005 0.026695882 0.120497835
## pctWSocSec 0.0364960167 0.050564832 0.112088449 -0.400503190
## pctWPubAsst -0.0168553064 -0.002461139 0.077715597 -0.219231653
## pctWRetire -0.0208566069 0.009201923 -0.001240257 -0.001034393
## medFamInc 0.0028139769 0.010133182 0.043410291 -0.235534442
## perCapInc 0.0200008004 0.325219876 -0.021524559 0.217520469
## population 0.7933876380 -0.052214507 0.174089820 0.010915736
## householdsize 0.0673227115 0.335676210 -0.035238632 0.183959236
## MedRent 0.0434540727 -0.300032109 -0.049328887 0.219987452
## MedRentPctHousInc -0.0286396745 0.096919086 0.005913681 -0.052418748
## MedOwnCostPctInc 0.0034383946 -0.008591856 0.005068180 -0.077648108
## MedOwnCostPctIncNoMtg -0.0007336214 -0.006672151 -0.002163386 0.058196020
## NumInShelters -0.2394748318 0.069394911 -0.757284751 -0.147872248
## NumStreet -0.5307941966 -0.018870204 0.586609708 0.124138206
## PC21 PC22 PC23 PC24
## racePctWhite 0.141081308 -0.328433995 0.601715286 0.0175643080
## racepctblack 0.072178140 -0.275616926 0.548353587 0.0163293919
## racePctAsian 0.019767582 -0.114046902 0.157672986 0.0025570664
## racePctHisp -0.002368720 -0.102290926 0.244643748 -0.0037202019
## agePct12t21 0.187605489 -0.110903003 -0.147244883 0.0943371860
## agePct12t29 -0.204127360 0.063478365 0.137647732 0.0837258087
## agePct16t24 0.013110010 0.092265555 0.013135017 -0.2324431180
## agePct65up 0.595565091 0.371860305 0.081111594 0.0583781357
## medIncome -0.188635321 0.359109126 0.196219840 -0.6537622256
## pctWWage 0.330532608 -0.190440954 -0.237523633 0.0245100007
## pctWFarmSelf 0.003831973 0.002164665 0.017570504 0.0006841688
## pctWInvInc -0.010149414 -0.059093743 -0.004933547 -0.0112930813
## pctWSocSec -0.464960391 -0.419979450 -0.195577671 -0.0416653745
## pctWPubAsst 0.187595194 0.014020610 -0.034871730 0.0186418323
## pctWRetire 0.047252238 -0.015043748 -0.008572597 0.0153129232
## medFamInc -0.245315165 0.285934547 0.123379490 0.6921688455
## perCapInc 0.255985400 -0.428910034 -0.226453349 -0.1088144880
## population 0.010397383 0.001919353 -0.011466082 0.0056723172
## householdsize 0.103288577 -0.039500187 0.027186953 0.0385894992
## MedRent 0.058335887 -0.118646748 -0.048917374 0.0516522605
## MedRentPctHousInc -0.016253610 -0.004786793 0.002140612 -0.0165952872
## MedOwnCostPctInc -0.023549934 -0.027278375 -0.021421095 0.0083062037
## MedOwnCostPctIncNoMtg 0.045220591 0.022070851 -0.008817806 -0.0076497241
## NumInShelters -0.031751621 -0.028126384 0.004460585 0.0053154155
## NumStreet 0.020767072 0.025222310 0.005916927 -0.0068258961
## PC25
## racePctWhite 0.028388503
## racepctblack 0.019896881
## racePctAsian 0.002659131
## racePctHisp 0.011488703
## agePct12t21 -0.310194464
## agePct12t29 -0.543596508
## agePct16t24 0.715595891
## agePct65up -0.132949782
## medIncome -0.208645999
## pctWWage -0.030734698
## pctWFarmSelf 0.002586038
## pctWInvInc -0.046342680
## pctWSocSec -0.036039752
## pctWPubAsst 0.001550049
## pctWRetire -0.005066675
## medFamInc 0.148821816
## perCapInc -0.030134354
## population 0.009567918
## householdsize 0.066543978
## MedRent 0.021335601
## MedRentPctHousInc -0.031986851
## MedOwnCostPctInc 0.002077979
## MedOwnCostPctIncNoMtg 0.004406619
## NumInShelters -0.012265701
## NumStreet 0.001000551
prin_comp$rotation[1:5,1:4]
## PC1 PC2 PC3 PC4
## racePctWhite -0.19496607 0.15084528 0.2206510 -0.3277653
## racepctblack 0.20368429 -0.03254688 -0.1485694 0.1898232
## racePctAsian -0.11330263 -0.14632940 -0.1612804 0.2102543
## racePctHisp 0.09292779 -0.17773566 -0.1171309 0.2566477
## agePct12t21 0.13521418 -0.30494828 0.1707663 -0.1760782
dim(prin_comp$x)
## [1] 2214 25
biplot(prin_comp, scale = 0)
#compute standard deviation of each principal component
std_dev <- prin_comp$sdev
#compute variance
pr_var <- std_dev^2
#check variance of first 10 components
pr_var[1:10]
## [1] 6.4217887 4.8380623 3.0431018 2.3664283 1.7261087 1.3986183 1.0582751
## [8] 0.9289747 0.7533874 0.5484720
#proportion of variance explained
prop_varex <- pr_var/sum(pr_var)
prop_varex[1:20]
## [1] 0.256871549 0.193522492 0.121724074 0.094657132 0.069044348
## [6] 0.055944733 0.042331005 0.037158988 0.030135496 0.021938880
## [11] 0.018746060 0.016588006 0.010414913 0.008874618 0.005311199
## [16] 0.004348756 0.003554853 0.002200759 0.001872153 0.001626607
#scree plot
plot(prop_varex, xlab = "Principal Component",
ylab = "Proportion of Variance Explained",
type = "b")
#cumulative scree plot
plot(cumsum(prop_varex), xlab = "Principal Component",
ylab = "Cumulative Proportion of Variance Explained",
type = "b")
#Sample Code for Modification:
#Perform PCA -- Remember there can be no missing values
library(dplyr) #need to run to use the select function
keyPCAvars_new=prin_comp$x[,(1:15)]
head(keyPCAvars_new) #check to make sure have the PC that you selected
## PC1 PC2 PC3 PC4 PC5 PC6
## [1,] -6.048663 0.06392063 -0.52111938 0.61226269 0.8887188 0.09611968
## [2,] -2.306577 2.18140573 -0.04095126 0.04983085 1.2332492 -0.91691644
## [3,] -1.637678 0.40997114 0.43416473 -0.80399620 -0.8888893 -0.31661008
## [4,] 2.258904 2.60678628 0.21432680 0.11782464 0.7118478 -0.19680440
## [5,] 3.186172 -1.63831174 1.32049765 -0.75281208 3.1431048 -0.34143800
## [6,] 1.683914 -0.07209738 0.91569522 -2.19606991 0.3857232 -0.23670427
## PC7 PC8 PC9 PC10 PC11 PC12
## [1,] 0.3648790 1.6778271 -1.6018135 0.65091428 -0.4005154 -0.45397028
## [2,] -0.2437272 0.5402516 -0.7652250 -0.36548628 -0.6980016 -0.41135035
## [3,] -0.5570262 -0.1358803 1.0128444 -0.26160118 -0.1814805 0.27006931
## [4,] 0.8539024 -0.9946885 0.5965281 0.33968933 0.5007011 -0.73847415
## [5,] 0.4407419 -0.8261437 0.8899254 -0.38373083 0.3540222 -0.22478513
## [6,] -0.6169786 -0.2463915 0.6686113 -0.06896418 0.6105050 0.04132167
## PC13 PC14 PC15
## [1,] -0.38145987 -0.03803769 0.6438674
## [2,] -0.26719834 0.55632787 -0.1718497
## [3,] 0.01383782 -0.09282256 -0.5730750
## [4,] 0.47468628 0.02323714 -0.1869453
## [5,] 1.04362904 -0.18119195 -0.5719506
## [6,] 0.14771502 0.15257988 0.1653637
Crime_vs_PCAvars <- data.frame(cd2$robbbPerPop, keyPCAvars_new) #appending PC variables to original dataframe
str(Crime_vs_PCAvars) #to confirm that the cluster assignments have been appended
## 'data.frame': 2214 obs. of 16 variables:
## $ cd2.robbbPerPop: num 8.2 21.3 154.9 57.9 32 ...
## $ PC1 : num -6.05 -2.31 -1.64 2.26 3.19 ...
## $ PC2 : num 0.0639 2.1814 0.41 2.6068 -1.6383 ...
## $ PC3 : num -0.521 -0.041 0.434 0.214 1.32 ...
## $ PC4 : num 0.6123 0.0498 -0.804 0.1178 -0.7528 ...
## $ PC5 : num 0.889 1.233 -0.889 0.712 3.143 ...
## $ PC6 : num 0.0961 -0.9169 -0.3166 -0.1968 -0.3414 ...
## $ PC7 : num 0.365 -0.244 -0.557 0.854 0.441 ...
## $ PC8 : num 1.678 0.54 -0.136 -0.995 -0.826 ...
## $ PC9 : num -1.602 -0.765 1.013 0.597 0.89 ...
## $ PC10 : num 0.651 -0.365 -0.262 0.34 -0.384 ...
## $ PC11 : num -0.401 -0.698 -0.181 0.501 0.354 ...
## $ PC12 : num -0.454 -0.411 0.27 -0.738 -0.225 ...
## $ PC13 : num -0.3815 -0.2672 0.0138 0.4747 1.0436 ...
## $ PC14 : num -0.038 0.5563 -0.0928 0.0232 -0.1812 ...
## $ PC15 : num 0.644 -0.172 -0.573 -0.187 -0.572 ...
corr_df <- cor(Crime_vs_PCAvars) #correlation matrix to see which vars are correlated with salary
corr_df #prints correlation matrix
## cd2.robbbPerPop PC1 PC2 PC3
## cd2.robbbPerPop 1.00000000 4.055421e-01 -1.304368e-01 -4.567068e-01
## PC1 0.40554206 1.000000e+00 3.496673e-17 3.017358e-16
## PC2 -0.13043681 3.496673e-17 1.000000e+00 1.694106e-16
## PC3 -0.45670684 3.017358e-16 1.694106e-16 1.000000e+00
## PC4 0.31283391 5.525851e-16 -2.754130e-16 -3.814113e-16
## PC5 -0.02306963 -8.017401e-17 -6.554519e-16 3.665724e-17
## PC6 0.23850997 -1.544453e-16 4.022108e-16 6.805828e-17
## PC7 -0.17920554 7.909810e-18 -3.923315e-16 5.319084e-17
## PC8 0.06102357 -4.550797e-16 1.881573e-16 8.427131e-16
## PC9 0.01070444 7.100241e-16 3.844164e-16 -4.610575e-16
## PC10 -0.08074695 4.914855e-16 3.222127e-16 4.675904e-16
## PC11 0.08363364 -5.029463e-16 -3.031920e-17 -8.956546e-17
## PC12 0.10644314 8.209630e-16 -1.878157e-16 -7.468837e-17
## PC13 -0.06538260 9.745620e-16 9.635939e-16 -1.088160e-16
## PC14 -0.01767901 -4.880933e-16 -5.111330e-17 7.887764e-16
## PC15 -0.03725020 -5.667529e-16 -1.107999e-15 -3.737522e-17
## PC4 PC5 PC6 PC7
## cd2.robbbPerPop 3.128339e-01 -2.306963e-02 2.385100e-01 -1.792055e-01
## PC1 5.525851e-16 -8.017401e-17 -1.544453e-16 7.909810e-18
## PC2 -2.754130e-16 -6.554519e-16 4.022108e-16 -3.923315e-16
## PC3 -3.814113e-16 3.665724e-17 6.805828e-17 5.319084e-17
## PC4 1.000000e+00 -1.982472e-17 -1.322068e-15 1.090921e-15
## PC5 -1.982472e-17 1.000000e+00 -1.546357e-17 1.212065e-16
## PC6 -1.322068e-15 -1.546357e-17 1.000000e+00 -1.220096e-16
## PC7 1.090921e-15 1.212065e-16 -1.220096e-16 1.000000e+00
## PC8 -1.021859e-15 -3.082779e-16 -4.480198e-16 -6.316029e-16
## PC9 3.076937e-16 3.139589e-17 1.968417e-16 -4.183258e-16
## PC10 -4.927518e-16 5.393578e-16 8.672927e-17 3.700685e-16
## PC11 7.155934e-17 -1.252934e-16 -6.539622e-16 5.789101e-17
## PC12 -2.063056e-16 9.629591e-16 -1.197839e-15 -1.661184e-16
## PC13 -6.443009e-16 7.084131e-16 8.761966e-16 -2.098495e-16
## PC14 -8.299632e-17 -7.297101e-16 6.628259e-16 4.483261e-16
## PC15 4.178229e-16 -2.097534e-16 7.065229e-16 -3.129904e-16
## PC8 PC9 PC10 PC11
## cd2.robbbPerPop 6.102357e-02 1.070444e-02 -8.074695e-02 8.363364e-02
## PC1 -4.550797e-16 7.100241e-16 4.914855e-16 -5.029463e-16
## PC2 1.881573e-16 3.844164e-16 3.222127e-16 -3.031920e-17
## PC3 8.427131e-16 -4.610575e-16 4.675904e-16 -8.956546e-17
## PC4 -1.021859e-15 3.076937e-16 -4.927518e-16 7.155934e-17
## PC5 -3.082779e-16 3.139589e-17 5.393578e-16 -1.252934e-16
## PC6 -4.480198e-16 1.968417e-16 8.672927e-17 -6.539622e-16
## PC7 -6.316029e-16 -4.183258e-16 3.700685e-16 5.789101e-17
## PC8 1.000000e+00 -1.811550e-16 7.591187e-17 -1.257134e-15
## PC9 -1.811550e-16 1.000000e+00 -4.238343e-16 1.109276e-16
## PC10 7.591187e-17 -4.238343e-16 1.000000e+00 -2.003328e-17
## PC11 -1.257134e-15 1.109276e-16 -2.003328e-17 1.000000e+00
## PC12 2.080204e-16 3.276420e-16 8.668513e-16 -5.201015e-16
## PC13 -3.333472e-16 -1.455608e-16 5.820771e-19 8.343204e-16
## PC14 -5.152261e-16 -6.968134e-16 6.017865e-16 6.380302e-16
## PC15 1.300188e-15 -3.905049e-16 -7.047022e-16 2.721442e-16
## PC12 PC13 PC14 PC15
## cd2.robbbPerPop 1.064431e-01 -6.538260e-02 -1.767901e-02 -3.725020e-02
## PC1 8.209630e-16 9.745620e-16 -4.880933e-16 -5.667529e-16
## PC2 -1.878157e-16 9.635939e-16 -5.111330e-17 -1.107999e-15
## PC3 -7.468837e-17 -1.088160e-16 7.887764e-16 -3.737522e-17
## PC4 -2.063056e-16 -6.443009e-16 -8.299632e-17 4.178229e-16
## PC5 9.629591e-16 7.084131e-16 -7.297101e-16 -2.097534e-16
## PC6 -1.197839e-15 8.761966e-16 6.628259e-16 7.065229e-16
## PC7 -1.661184e-16 -2.098495e-16 4.483261e-16 -3.129904e-16
## PC8 2.080204e-16 -3.333472e-16 -5.152261e-16 1.300188e-15
## PC9 3.276420e-16 -1.455608e-16 -6.968134e-16 -3.905049e-16
## PC10 8.668513e-16 5.820771e-19 6.017865e-16 -7.047022e-16
## PC11 -5.201015e-16 8.343204e-16 6.380302e-16 2.721442e-16
## PC12 1.000000e+00 1.753261e-15 -1.442269e-15 4.167717e-16
## PC13 1.753261e-15 1.000000e+00 -6.837209e-16 -1.026370e-15
## PC14 -1.442269e-15 -6.837209e-16 1.000000e+00 -7.171544e-16
## PC15 4.167717e-16 -1.026370e-15 -7.171544e-16 1.000000e+00
#make a correlation heatmap
library(ggplot2)
library(reshape2)
melted_corr <- melt(corr_df)
ggplot(data = melted_corr, aes(x=Var1, y=Var2, fill=value)) + geom_tile() + scale_fill_gradient(low="yellow", high="blue")