Research Question

The focus of the research will be on how robbery in different communities in the United States could be explained by some of the societal factors such as ethnicity, income, age, population etc. and using an unsupervised learning algorithm to rank or group communities (into high, medium or low) by their susceptibility to robberies.

library(readr)
crimedata <- read_csv("C:/Users/shreejit/Downloads/crimedata.csv")

cd <- crimedata

cd[cd=="?"] <- NA



# Predictor Variables and the dependant Variable 
myvars2 <- c (
  "robbbPerPop",
  "racePctWhite",
  "racepctblack",
  "racePctAsian",
  "racePctHisp",  
  "agePct12t21",
  "agePct12t29",
  "agePct16t24",
  "agePct65up",
  "medIncome",
  "pctWWage",
  "pctWFarmSelf",
  "pctWInvInc",
  "pctWSocSec",
  "pctWPubAsst",
  "pctWRetire",
  "medFamInc",
  "perCapInc",
  "population",
  "householdsize",
  "MedRent",
  "MedRentPctHousInc",
  "MedOwnCostPctInc",
  "MedOwnCostPctIncNoMtg",
  "NumInShelters",
  "NumStreet"
)

cd2 <- cd[myvars2]
#converting dependent variable from Character to numeric 
cd2$robbbPerPop <- as.numeric(as.character(cd2$robbbPerPop))
cd2 <- na.omit(cd2)
str(cd2)
## Classes 'tbl_df', 'tbl' and 'data.frame':    2214 obs. of  26 variables:
##  $ robbbPerPop          : num  8.2 21.3 154.9 57.9 32 ...
##  $ racePctWhite         : num  91.8 95.6 94.3 97.3 89.2 ...
##  $ racepctblack         : num  1.37 0.8 0.74 1.7 0.53 ...
##  $ racePctAsian         : num  6.5 3.44 3.43 0.5 1.17 0.9 1.47 0.4 1.25 0.92 ...
##  $ racePctHisp          : num  1.88 0.85 2.35 0.7 0.52 ...
##  $ agePct12t21          : num  12.5 11 11.4 12.6 24.5 ...
##  $ agePct12t29          : num  21.4 21.3 25.9 25.2 40.5 ...
##  $ agePct16t24          : num  10.9 10.5 11 12.2 28.7 ...
##  $ agePct65up           : num  11.3 17.2 10.3 17.6 12.6 ...
##  $ medIncome            : int  75122 47917 35669 20580 17390 21577 42805 23221 25326 17852 ...
##  $ pctWWage             : num  89.2 79 82 68.2 69.3 ...
##  $ pctWFarmSelf         : num  1.55 1.11 1.15 0.24 0.55 1 0.39 0.67 2.93 0.86 ...
##  $ pctWInvInc           : num  70.2 64.1 55.7 39 42.8 ...
##  $ pctWSocSec           : num  23.6 35.5 22.2 39.5 32.2 ...
##  $ pctWPubAsst          : num  1.03 2.75 2.94 11.71 11.21 ...
##  $ pctWRetire           : num  18.4 22.9 14.6 18.3 14.4 ...
##  $ medFamInc            : int  79584 55323 42112 26501 24018 27705 50394 28901 34269 24058 ...
##  $ perCapInc            : int  29711 20148 16946 10810 8483 11878 18193 12161 13554 10195 ...
##  $ population           : int  11980 23123 29344 16656 11245 140494 28700 59459 74111 103590 ...
##  $ householdsize        : num  3.1 2.82 2.43 2.4 2.76 2.45 2.6 2.45 2.46 2.62 ...
##  $ MedRent              : int  1001 627 484 333 332 340 736 338 355 353 ...
##  $ MedRentPctHousInc    : num  23.8 27.6 24.1 28.7 32.2 26.4 24.4 26.3 25.2 29.6 ...
##  $ MedOwnCostPctInc     : num  21.1 20.7 21.7 20.6 23.2 17.3 20.8 15.1 20.7 19.4 ...
##  $ MedOwnCostPctIncNoMtg: num  14 12.5 11.6 14.5 12.9 11.7 12.5 12.2 12.8 13 ...
##  $ NumInShelters        : int  11 0 16 0 2 327 0 21 125 43 ...
##  $ NumStreet            : int  0 0 0 0 0 4 0 0 15 4 ...
##  - attr(*, "na.action")=Class 'omit'  Named int 819
##   .. ..- attr(*, "names")= chr "819"
# All predictor variables for PCA  

myvars <- c (
"racePctWhite",
"racepctblack",
"racePctAsian",
"racePctHisp",  
"agePct12t21",
"agePct12t29",
"agePct16t24",
"agePct65up",
"medIncome",
"pctWWage",
"pctWFarmSelf",
"pctWInvInc",
"pctWSocSec",
"pctWPubAsst",
"pctWRetire",
"medFamInc",
"perCapInc",
"population",
"householdsize",
"MedRent",
"MedRentPctHousInc",
"MedOwnCostPctInc",
"MedOwnCostPctIncNoMtg",
"NumInShelters",
"NumStreet"
)

newdata <- cd2[myvars]

# Getting rid of all rows with Null values 
my_data <- na.omit(newdata)

#check available variables
colnames(my_data)
##  [1] "racePctWhite"          "racepctblack"         
##  [3] "racePctAsian"          "racePctHisp"          
##  [5] "agePct12t21"           "agePct12t29"          
##  [7] "agePct16t24"           "agePct65up"           
##  [9] "medIncome"             "pctWWage"             
## [11] "pctWFarmSelf"          "pctWInvInc"           
## [13] "pctWSocSec"            "pctWPubAsst"          
## [15] "pctWRetire"            "medFamInc"            
## [17] "perCapInc"             "population"           
## [19] "householdsize"         "MedRent"              
## [21] "MedRentPctHousInc"     "MedOwnCostPctInc"     
## [23] "MedOwnCostPctIncNoMtg" "NumInShelters"        
## [25] "NumStreet"
#check variable class
str(my_data)
## Classes 'tbl_df', 'tbl' and 'data.frame':    2214 obs. of  25 variables:
##  $ racePctWhite         : num  91.8 95.6 94.3 97.3 89.2 ...
##  $ racepctblack         : num  1.37 0.8 0.74 1.7 0.53 ...
##  $ racePctAsian         : num  6.5 3.44 3.43 0.5 1.17 0.9 1.47 0.4 1.25 0.92 ...
##  $ racePctHisp          : num  1.88 0.85 2.35 0.7 0.52 ...
##  $ agePct12t21          : num  12.5 11 11.4 12.6 24.5 ...
##  $ agePct12t29          : num  21.4 21.3 25.9 25.2 40.5 ...
##  $ agePct16t24          : num  10.9 10.5 11 12.2 28.7 ...
##  $ agePct65up           : num  11.3 17.2 10.3 17.6 12.6 ...
##  $ medIncome            : int  75122 47917 35669 20580 17390 21577 42805 23221 25326 17852 ...
##  $ pctWWage             : num  89.2 79 82 68.2 69.3 ...
##  $ pctWFarmSelf         : num  1.55 1.11 1.15 0.24 0.55 1 0.39 0.67 2.93 0.86 ...
##  $ pctWInvInc           : num  70.2 64.1 55.7 39 42.8 ...
##  $ pctWSocSec           : num  23.6 35.5 22.2 39.5 32.2 ...
##  $ pctWPubAsst          : num  1.03 2.75 2.94 11.71 11.21 ...
##  $ pctWRetire           : num  18.4 22.9 14.6 18.3 14.4 ...
##  $ medFamInc            : int  79584 55323 42112 26501 24018 27705 50394 28901 34269 24058 ...
##  $ perCapInc            : int  29711 20148 16946 10810 8483 11878 18193 12161 13554 10195 ...
##  $ population           : int  11980 23123 29344 16656 11245 140494 28700 59459 74111 103590 ...
##  $ householdsize        : num  3.1 2.82 2.43 2.4 2.76 2.45 2.6 2.45 2.46 2.62 ...
##  $ MedRent              : int  1001 627 484 333 332 340 736 338 355 353 ...
##  $ MedRentPctHousInc    : num  23.8 27.6 24.1 28.7 32.2 26.4 24.4 26.3 25.2 29.6 ...
##  $ MedOwnCostPctInc     : num  21.1 20.7 21.7 20.6 23.2 17.3 20.8 15.1 20.7 19.4 ...
##  $ MedOwnCostPctIncNoMtg: num  14 12.5 11.6 14.5 12.9 11.7 12.5 12.2 12.8 13 ...
##  $ NumInShelters        : int  11 0 16 0 2 327 0 21 125 43 ...
##  $ NumStreet            : int  0 0 0 0 0 4 0 0 15 4 ...
#principal component analysis

prin_comp <- prcomp(my_data, scale. = T)
names(prin_comp)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
#outputs the mean of variables
prin_comp$center
##          racePctWhite          racepctblack          racePctAsian 
##          8.399395e+01          9.326631e+00          2.669548e+00 
##           racePctHisp           agePct12t21           agePct12t29 
##          7.941287e+00          1.444586e+01          2.764354e+01 
##           agePct16t24            agePct65up             medIncome 
##          1.397480e+01          1.183833e+01          3.398818e+04 
##              pctWWage          pctWFarmSelf            pctWInvInc 
##          7.831075e+01          8.818609e-01          4.375618e+01 
##            pctWSocSec           pctWPubAsst            pctWRetire 
##          2.641326e+01          6.801328e+00          1.597203e+01 
##             medFamInc             perCapInc            population 
##          3.986140e+04          1.560413e+04          5.240550e+04 
##         householdsize               MedRent     MedRentPctHousInc 
##          2.707358e+00          5.015167e+02          2.629932e+01 
##      MedOwnCostPctInc MedOwnCostPctIncNoMtg         NumInShelters 
##          2.099065e+01          1.301003e+01          6.617977e+01 
##             NumStreet 
##          1.776332e+01
#outputs the standard deviation of variables
prin_comp$scale
##          racePctWhite          racepctblack          racePctAsian 
##          1.640931e+01          1.424479e+01          4.474748e+00 
##           racePctHisp           agePct12t21           agePct12t29 
##          1.458713e+01          4.519644e+00          6.182609e+00 
##           agePct16t24            agePct65up             medIncome 
##          5.972074e+00          4.777771e+00          1.342671e+04 
##              pctWWage          pctWFarmSelf            pctWInvInc 
##          7.951909e+00          6.891607e-01          1.278843e+01 
##            pctWSocSec           pctWPubAsst            pctWRetire 
##          8.295505e+00          4.701393e+00          4.621396e+00 
##             medFamInc             perCapInc            population 
##          1.425296e+04          6.282913e+03          2.018996e+05 
##         householdsize               MedRent     MedRentPctHousInc 
##          3.341921e-01          1.692934e+02          2.979418e+00 
##      MedOwnCostPctInc MedOwnCostPctIncNoMtg         NumInShelters 
##          2.988207e+00          1.419975e+00          5.632041e+02 
##             NumStreet 
##          2.454917e+02
prin_comp$rotation
##                                PC1         PC2          PC3           PC4
## racePctWhite          -0.194966074  0.15084528  0.220651048 -0.3277652840
## racepctblack           0.203684291 -0.03254688 -0.148569392  0.1898231662
## racePctAsian          -0.113302630 -0.14632940 -0.161280369  0.2102543266
## racePctHisp            0.092927785 -0.17773566 -0.117130935  0.2566476895
## agePct12t21            0.135214178 -0.30494828  0.170766306 -0.1760782192
## agePct12t29            0.134059034 -0.34654532  0.128744816 -0.1476563493
## agePct16t24            0.128323502 -0.28574342  0.152021052 -0.1827543671
## agePct65up             0.100652078  0.38482150 -0.036706014  0.0361981154
## medIncome             -0.372544960 -0.02789630 -0.067968569  0.0765699591
## pctWWage              -0.253760879 -0.28712908  0.038895535 -0.0594061412
## pctWFarmSelf           0.004626702 -0.08639354  0.138292071 -0.1649668193
## pctWInvInc            -0.324603433  0.10537184  0.045906483 -0.1430684741
## pctWSocSec             0.173671652  0.35678348 -0.008288950  0.0321832452
## pctWPubAsst            0.311729165 -0.02726399 -0.106239593  0.2030954297
## pctWRetire             0.041623339  0.32047565 -0.002933007  0.0008269341
## medFamInc             -0.369383773 -0.01511754 -0.050868267  0.0400932609
## perCapInc             -0.338601858  0.06292932 -0.085978796  0.0497343422
## population             0.044420629 -0.05515756 -0.483489393 -0.2906476826
## householdsize         -0.014256156 -0.32058017  0.020669083  0.1163206471
## MedRent               -0.318981316 -0.05822502 -0.147224098  0.2214387707
## MedRentPctHousInc      0.152490950 -0.09710396 -0.095530879  0.2365452367
## MedOwnCostPctInc      -0.122531436 -0.12067960 -0.195960273  0.3597321132
## MedOwnCostPctIncNoMtg  0.049481648  0.08468490 -0.078366463  0.1558236402
## NumInShelters          0.043148452 -0.04131481 -0.483979529 -0.3116104268
## NumStreet              0.030563239 -0.03956464 -0.476275296 -0.3189393376
##                                PC5          PC6          PC7           PC8
## racePctWhite           0.068525656 -0.208849041  0.254117304 -0.2414013384
## racepctblack          -0.051104496  0.537133929 -0.179114838  0.2787583120
## racePctAsian           0.120465948 -0.166138668 -0.492224447  0.0028516745
## racePctHisp           -0.118464087 -0.502687193  0.126864837 -0.0322023390
## agePct12t21            0.336439510  0.023235062 -0.005008257  0.0981375407
## agePct12t29            0.266345884  0.094242980 -0.011497915 -0.0913016769
## agePct16t24            0.423927249  0.077685880 -0.037866170  0.0105478639
## agePct65up             0.250433382 -0.116320768 -0.027857256  0.1231629349
## medIncome              0.053985428  0.038840994  0.004907196  0.1261745180
## pctWWage              -0.201491343  0.135505764  0.030624432 -0.1575196043
## pctWFarmSelf          -0.122615531 -0.282596124  0.174117441  0.7872394782
## pctWInvInc             0.242775829 -0.000783563 -0.042686987  0.0952742331
## pctWSocSec             0.221405434 -0.122266003  0.047358136  0.1312105189
## pctWPubAsst           -0.053857152 -0.071590623 -0.026110079  0.0868618787
## pctWRetire             0.260181655 -0.114440292 -0.140869141 -0.1654128197
## medFamInc              0.131614567  0.070954796 -0.016112279  0.1659986438
## perCapInc              0.139744080  0.088842366 -0.074585848  0.2520430539
## population             0.009998794 -0.021300977  0.015143606 -0.0174166757
## householdsize         -0.010137946 -0.228771551  0.154331046  0.0405206803
## MedRent                0.187998413 -0.002097012  0.035946363 -0.0283767676
## MedRentPctHousInc      0.442632324 -0.091883331  0.056203843 -0.0049924651
## MedOwnCostPctInc       0.131655676 -0.113043586  0.286567821 -0.1170064794
## MedOwnCostPctIncNoMtg  0.117660753  0.366992133  0.681911501  0.0203715991
## NumInShelters          0.020569932 -0.020867391  0.043336724  0.0092908815
## NumStreet              0.023661749 -0.060174421  0.058329456  0.0001295475
##                                PC9         PC10         PC11         PC12
## racePctWhite           0.114671868 -0.020927471  0.066170878 -0.217652198
## racepctblack          -0.130870025 -0.194932483 -0.145159488  0.142995801
## racePctAsian           0.254119384  0.642205481 -0.177401447 -0.197688381
## racePctHisp           -0.168741020  0.072133863  0.337421747  0.414994475
## agePct12t21           -0.183874419  0.037232194  0.028624099  0.037833105
## agePct12t29            0.085148990  0.084806817  0.031202492  0.201028534
## agePct16t24            0.015256540  0.072666201  0.060245003  0.205419565
## agePct65up             0.042878890  0.062282509  0.022954255  0.268322186
## medIncome             -0.217380164 -0.031428945  0.072497596 -0.084525536
## pctWWage               0.004508732 -0.026203359 -0.164923531 -0.007913330
## pctWFarmSelf           0.228715900 -0.010196405 -0.299324462 -0.068428764
## pctWInvInc            -0.047117470  0.020614403  0.042043849  0.007002220
## pctWSocSec            -0.125705873  0.109229836  0.005664149  0.113317620
## pctWPubAsst           -0.118149600  0.043442739  0.326520891 -0.393329754
## pctWRetire            -0.376706788 -0.056885797 -0.490197320 -0.030031284
## medFamInc             -0.161739407 -0.020893808  0.174037452 -0.029999171
## perCapInc             -0.054076935 -0.050292042  0.362946991  0.106233633
## population            -0.008323445 -0.022233984  0.005498385  0.001397416
## householdsize         -0.603293074  0.008729334 -0.248418130 -0.185068372
## MedRent                0.010802853 -0.054712934 -0.063406735  0.015621509
## MedRentPctHousInc      0.248529062 -0.426679225  0.095980800 -0.479134210
## MedOwnCostPctInc       0.322243890 -0.212426156 -0.337451938  0.305482481
## MedOwnCostPctIncNoMtg -0.019596564  0.522675086 -0.010842237 -0.120180739
## NumInShelters         -0.018038223  0.003678950 -0.008589777 -0.013415971
## NumStreet             -0.022742570 -0.004881236 -0.038171024 -0.016021068
##                               PC13        PC14         PC15         PC16
## racePctWhite           0.157844377 -0.09632605  0.095400357 -0.038432884
## racepctblack          -0.010854771  0.07652033 -0.061032936  0.045699440
## racePctAsian           0.044314516  0.07186322 -0.027947816 -0.128032659
## racePctHisp           -0.415060395  0.06163638 -0.083794492 -0.029733914
## agePct12t21            0.161832455  0.01203091 -0.037622354 -0.251347357
## agePct12t29           -0.087572927 -0.20438408  0.111142279  0.198541607
## agePct16t24           -0.002904052 -0.10291467  0.048849757  0.062685460
## agePct65up             0.150902367  0.23217675  0.107687800  0.090064393
## medIncome              0.054842571 -0.05200378  0.076102080 -0.096076362
## pctWWage              -0.236502306  0.10088984  0.004277232  0.103872765
## pctWFarmSelf          -0.170983769 -0.12975999  0.065940697  0.051430679
## pctWInvInc            -0.088680994  0.12554054 -0.742182827  0.425220623
## pctWSocSec             0.209979613  0.18636609  0.117561565  0.118526584
## pctWPubAsst            0.185217688 -0.55912986 -0.138377387  0.314160309
## pctWRetire            -0.429325885 -0.41562538 -0.005500567 -0.144247171
## medFamInc              0.018209427 -0.07903823  0.046597745 -0.162932484
## perCapInc             -0.004012980 -0.20677954  0.077455946 -0.332116083
## population            -0.064656908  0.01239655 -0.087852637 -0.005311972
## householdsize          0.315621806  0.26991760 -0.012546237  0.036861448
## MedRent               -0.057493496 -0.03338195  0.514544782  0.586875744
## MedRentPctHousInc     -0.280447295  0.31857828 -0.011213426 -0.131259420
## MedOwnCostPctInc       0.388259048 -0.27975965 -0.256317099 -0.133608991
## MedOwnCostPctIncNoMtg -0.208491175  0.01969186 -0.039184760 -0.079761533
## NumInShelters          0.014941251 -0.01337515  0.011052363  0.040571124
## NumStreet              0.069229506  0.02122940  0.096659849 -0.041782149
##                                PC17         PC18         PC19         PC20
## racePctWhite           0.0499129819 -0.018357086  0.007630113 -0.089484677
## racepctblack          -0.0272255962 -0.047483827  0.034202334 -0.057525591
## racePctAsian           0.0015487341  0.005971051  0.003099429 -0.030778204
## racePctHisp           -0.0623762475 -0.121267695  0.008079894 -0.058898997
## agePct12t21           -0.0149038416 -0.630449110 -0.053833023 -0.043780451
## agePct12t29            0.0298779103  0.452760106  0.056117263  0.045164860
## agePct16t24            0.0001540203  0.075984304  0.017856832 -0.093633115
## agePct65up             0.0372698642  0.150827728  0.001458749 -0.165901754
## medIncome              0.0211392021 -0.095254520  0.026616444 -0.252969860
## pctWWage              -0.0412375161  0.113690181  0.135173999 -0.649722881
## pctWFarmSelf           0.0252363372  0.011365098  0.006645177 -0.017708325
## pctWInvInc            -0.0965364770 -0.030539005  0.026695882  0.120497835
## pctWSocSec             0.0364960167  0.050564832  0.112088449 -0.400503190
## pctWPubAsst           -0.0168553064 -0.002461139  0.077715597 -0.219231653
## pctWRetire            -0.0208566069  0.009201923 -0.001240257 -0.001034393
## medFamInc              0.0028139769  0.010133182  0.043410291 -0.235534442
## perCapInc              0.0200008004  0.325219876 -0.021524559  0.217520469
## population             0.7933876380 -0.052214507  0.174089820  0.010915736
## householdsize          0.0673227115  0.335676210 -0.035238632  0.183959236
## MedRent                0.0434540727 -0.300032109 -0.049328887  0.219987452
## MedRentPctHousInc     -0.0286396745  0.096919086  0.005913681 -0.052418748
## MedOwnCostPctInc       0.0034383946 -0.008591856  0.005068180 -0.077648108
## MedOwnCostPctIncNoMtg -0.0007336214 -0.006672151 -0.002163386  0.058196020
## NumInShelters         -0.2394748318  0.069394911 -0.757284751 -0.147872248
## NumStreet             -0.5307941966 -0.018870204  0.586609708  0.124138206
##                               PC21         PC22         PC23          PC24
## racePctWhite           0.141081308 -0.328433995  0.601715286  0.0175643080
## racepctblack           0.072178140 -0.275616926  0.548353587  0.0163293919
## racePctAsian           0.019767582 -0.114046902  0.157672986  0.0025570664
## racePctHisp           -0.002368720 -0.102290926  0.244643748 -0.0037202019
## agePct12t21            0.187605489 -0.110903003 -0.147244883  0.0943371860
## agePct12t29           -0.204127360  0.063478365  0.137647732  0.0837258087
## agePct16t24            0.013110010  0.092265555  0.013135017 -0.2324431180
## agePct65up             0.595565091  0.371860305  0.081111594  0.0583781357
## medIncome             -0.188635321  0.359109126  0.196219840 -0.6537622256
## pctWWage               0.330532608 -0.190440954 -0.237523633  0.0245100007
## pctWFarmSelf           0.003831973  0.002164665  0.017570504  0.0006841688
## pctWInvInc            -0.010149414 -0.059093743 -0.004933547 -0.0112930813
## pctWSocSec            -0.464960391 -0.419979450 -0.195577671 -0.0416653745
## pctWPubAsst            0.187595194  0.014020610 -0.034871730  0.0186418323
## pctWRetire             0.047252238 -0.015043748 -0.008572597  0.0153129232
## medFamInc             -0.245315165  0.285934547  0.123379490  0.6921688455
## perCapInc              0.255985400 -0.428910034 -0.226453349 -0.1088144880
## population             0.010397383  0.001919353 -0.011466082  0.0056723172
## householdsize          0.103288577 -0.039500187  0.027186953  0.0385894992
## MedRent                0.058335887 -0.118646748 -0.048917374  0.0516522605
## MedRentPctHousInc     -0.016253610 -0.004786793  0.002140612 -0.0165952872
## MedOwnCostPctInc      -0.023549934 -0.027278375 -0.021421095  0.0083062037
## MedOwnCostPctIncNoMtg  0.045220591  0.022070851 -0.008817806 -0.0076497241
## NumInShelters         -0.031751621 -0.028126384  0.004460585  0.0053154155
## NumStreet              0.020767072  0.025222310  0.005916927 -0.0068258961
##                               PC25
## racePctWhite           0.028388503
## racepctblack           0.019896881
## racePctAsian           0.002659131
## racePctHisp            0.011488703
## agePct12t21           -0.310194464
## agePct12t29           -0.543596508
## agePct16t24            0.715595891
## agePct65up            -0.132949782
## medIncome             -0.208645999
## pctWWage              -0.030734698
## pctWFarmSelf           0.002586038
## pctWInvInc            -0.046342680
## pctWSocSec            -0.036039752
## pctWPubAsst            0.001550049
## pctWRetire            -0.005066675
## medFamInc              0.148821816
## perCapInc             -0.030134354
## population             0.009567918
## householdsize          0.066543978
## MedRent                0.021335601
## MedRentPctHousInc     -0.031986851
## MedOwnCostPctInc       0.002077979
## MedOwnCostPctIncNoMtg  0.004406619
## NumInShelters         -0.012265701
## NumStreet              0.001000551
prin_comp$rotation[1:5,1:4]
##                      PC1         PC2        PC3        PC4
## racePctWhite -0.19496607  0.15084528  0.2206510 -0.3277653
## racepctblack  0.20368429 -0.03254688 -0.1485694  0.1898232
## racePctAsian -0.11330263 -0.14632940 -0.1612804  0.2102543
## racePctHisp   0.09292779 -0.17773566 -0.1171309  0.2566477
## agePct12t21   0.13521418 -0.30494828  0.1707663 -0.1760782
dim(prin_comp$x)
## [1] 2214   25
biplot(prin_comp, scale = 0)

#compute standard deviation of each principal component
std_dev <- prin_comp$sdev

#compute variance
pr_var <- std_dev^2

#check variance of first 10 components
pr_var[1:10]
##  [1] 6.4217887 4.8380623 3.0431018 2.3664283 1.7261087 1.3986183 1.0582751
##  [8] 0.9289747 0.7533874 0.5484720
#proportion of variance explained
prop_varex <- pr_var/sum(pr_var)
prop_varex[1:20]
##  [1] 0.256871549 0.193522492 0.121724074 0.094657132 0.069044348
##  [6] 0.055944733 0.042331005 0.037158988 0.030135496 0.021938880
## [11] 0.018746060 0.016588006 0.010414913 0.008874618 0.005311199
## [16] 0.004348756 0.003554853 0.002200759 0.001872153 0.001626607
#scree plot
plot(prop_varex, xlab = "Principal Component",
     ylab = "Proportion of Variance Explained",
     type = "b")

#cumulative scree plot
plot(cumsum(prop_varex), xlab = "Principal Component",
     ylab = "Cumulative Proportion of Variance Explained",
     type = "b")



#Sample Code for Modification:
#Perform PCA -- Remember there can be no missing values
library(dplyr) #need to run to use the select function


 

keyPCAvars_new=prin_comp$x[,(1:15)] 

head(keyPCAvars_new) #check to make sure have the PC that you selected
##            PC1         PC2         PC3         PC4        PC5         PC6
## [1,] -6.048663  0.06392063 -0.52111938  0.61226269  0.8887188  0.09611968
## [2,] -2.306577  2.18140573 -0.04095126  0.04983085  1.2332492 -0.91691644
## [3,] -1.637678  0.40997114  0.43416473 -0.80399620 -0.8888893 -0.31661008
## [4,]  2.258904  2.60678628  0.21432680  0.11782464  0.7118478 -0.19680440
## [5,]  3.186172 -1.63831174  1.32049765 -0.75281208  3.1431048 -0.34143800
## [6,]  1.683914 -0.07209738  0.91569522 -2.19606991  0.3857232 -0.23670427
##             PC7        PC8        PC9        PC10       PC11        PC12
## [1,]  0.3648790  1.6778271 -1.6018135  0.65091428 -0.4005154 -0.45397028
## [2,] -0.2437272  0.5402516 -0.7652250 -0.36548628 -0.6980016 -0.41135035
## [3,] -0.5570262 -0.1358803  1.0128444 -0.26160118 -0.1814805  0.27006931
## [4,]  0.8539024 -0.9946885  0.5965281  0.33968933  0.5007011 -0.73847415
## [5,]  0.4407419 -0.8261437  0.8899254 -0.38373083  0.3540222 -0.22478513
## [6,] -0.6169786 -0.2463915  0.6686113 -0.06896418  0.6105050  0.04132167
##             PC13        PC14       PC15
## [1,] -0.38145987 -0.03803769  0.6438674
## [2,] -0.26719834  0.55632787 -0.1718497
## [3,]  0.01383782 -0.09282256 -0.5730750
## [4,]  0.47468628  0.02323714 -0.1869453
## [5,]  1.04362904 -0.18119195 -0.5719506
## [6,]  0.14771502  0.15257988  0.1653637
Crime_vs_PCAvars <- data.frame(cd2$robbbPerPop, keyPCAvars_new) #appending PC variables to original dataframe 

str(Crime_vs_PCAvars) #to confirm that the cluster assignments have been appended
## 'data.frame':    2214 obs. of  16 variables:
##  $ cd2.robbbPerPop: num  8.2 21.3 154.9 57.9 32 ...
##  $ PC1            : num  -6.05 -2.31 -1.64 2.26 3.19 ...
##  $ PC2            : num  0.0639 2.1814 0.41 2.6068 -1.6383 ...
##  $ PC3            : num  -0.521 -0.041 0.434 0.214 1.32 ...
##  $ PC4            : num  0.6123 0.0498 -0.804 0.1178 -0.7528 ...
##  $ PC5            : num  0.889 1.233 -0.889 0.712 3.143 ...
##  $ PC6            : num  0.0961 -0.9169 -0.3166 -0.1968 -0.3414 ...
##  $ PC7            : num  0.365 -0.244 -0.557 0.854 0.441 ...
##  $ PC8            : num  1.678 0.54 -0.136 -0.995 -0.826 ...
##  $ PC9            : num  -1.602 -0.765 1.013 0.597 0.89 ...
##  $ PC10           : num  0.651 -0.365 -0.262 0.34 -0.384 ...
##  $ PC11           : num  -0.401 -0.698 -0.181 0.501 0.354 ...
##  $ PC12           : num  -0.454 -0.411 0.27 -0.738 -0.225 ...
##  $ PC13           : num  -0.3815 -0.2672 0.0138 0.4747 1.0436 ...
##  $ PC14           : num  -0.038 0.5563 -0.0928 0.0232 -0.1812 ...
##  $ PC15           : num  0.644 -0.172 -0.573 -0.187 -0.572 ...
corr_df <- cor(Crime_vs_PCAvars) #correlation matrix to see which vars are correlated with salary

corr_df #prints correlation matrix
##                 cd2.robbbPerPop           PC1           PC2           PC3
## cd2.robbbPerPop      1.00000000  4.055421e-01 -1.304368e-01 -4.567068e-01
## PC1                  0.40554206  1.000000e+00  3.496673e-17  3.017358e-16
## PC2                 -0.13043681  3.496673e-17  1.000000e+00  1.694106e-16
## PC3                 -0.45670684  3.017358e-16  1.694106e-16  1.000000e+00
## PC4                  0.31283391  5.525851e-16 -2.754130e-16 -3.814113e-16
## PC5                 -0.02306963 -8.017401e-17 -6.554519e-16  3.665724e-17
## PC6                  0.23850997 -1.544453e-16  4.022108e-16  6.805828e-17
## PC7                 -0.17920554  7.909810e-18 -3.923315e-16  5.319084e-17
## PC8                  0.06102357 -4.550797e-16  1.881573e-16  8.427131e-16
## PC9                  0.01070444  7.100241e-16  3.844164e-16 -4.610575e-16
## PC10                -0.08074695  4.914855e-16  3.222127e-16  4.675904e-16
## PC11                 0.08363364 -5.029463e-16 -3.031920e-17 -8.956546e-17
## PC12                 0.10644314  8.209630e-16 -1.878157e-16 -7.468837e-17
## PC13                -0.06538260  9.745620e-16  9.635939e-16 -1.088160e-16
## PC14                -0.01767901 -4.880933e-16 -5.111330e-17  7.887764e-16
## PC15                -0.03725020 -5.667529e-16 -1.107999e-15 -3.737522e-17
##                           PC4           PC5           PC6           PC7
## cd2.robbbPerPop  3.128339e-01 -2.306963e-02  2.385100e-01 -1.792055e-01
## PC1              5.525851e-16 -8.017401e-17 -1.544453e-16  7.909810e-18
## PC2             -2.754130e-16 -6.554519e-16  4.022108e-16 -3.923315e-16
## PC3             -3.814113e-16  3.665724e-17  6.805828e-17  5.319084e-17
## PC4              1.000000e+00 -1.982472e-17 -1.322068e-15  1.090921e-15
## PC5             -1.982472e-17  1.000000e+00 -1.546357e-17  1.212065e-16
## PC6             -1.322068e-15 -1.546357e-17  1.000000e+00 -1.220096e-16
## PC7              1.090921e-15  1.212065e-16 -1.220096e-16  1.000000e+00
## PC8             -1.021859e-15 -3.082779e-16 -4.480198e-16 -6.316029e-16
## PC9              3.076937e-16  3.139589e-17  1.968417e-16 -4.183258e-16
## PC10            -4.927518e-16  5.393578e-16  8.672927e-17  3.700685e-16
## PC11             7.155934e-17 -1.252934e-16 -6.539622e-16  5.789101e-17
## PC12            -2.063056e-16  9.629591e-16 -1.197839e-15 -1.661184e-16
## PC13            -6.443009e-16  7.084131e-16  8.761966e-16 -2.098495e-16
## PC14            -8.299632e-17 -7.297101e-16  6.628259e-16  4.483261e-16
## PC15             4.178229e-16 -2.097534e-16  7.065229e-16 -3.129904e-16
##                           PC8           PC9          PC10          PC11
## cd2.robbbPerPop  6.102357e-02  1.070444e-02 -8.074695e-02  8.363364e-02
## PC1             -4.550797e-16  7.100241e-16  4.914855e-16 -5.029463e-16
## PC2              1.881573e-16  3.844164e-16  3.222127e-16 -3.031920e-17
## PC3              8.427131e-16 -4.610575e-16  4.675904e-16 -8.956546e-17
## PC4             -1.021859e-15  3.076937e-16 -4.927518e-16  7.155934e-17
## PC5             -3.082779e-16  3.139589e-17  5.393578e-16 -1.252934e-16
## PC6             -4.480198e-16  1.968417e-16  8.672927e-17 -6.539622e-16
## PC7             -6.316029e-16 -4.183258e-16  3.700685e-16  5.789101e-17
## PC8              1.000000e+00 -1.811550e-16  7.591187e-17 -1.257134e-15
## PC9             -1.811550e-16  1.000000e+00 -4.238343e-16  1.109276e-16
## PC10             7.591187e-17 -4.238343e-16  1.000000e+00 -2.003328e-17
## PC11            -1.257134e-15  1.109276e-16 -2.003328e-17  1.000000e+00
## PC12             2.080204e-16  3.276420e-16  8.668513e-16 -5.201015e-16
## PC13            -3.333472e-16 -1.455608e-16  5.820771e-19  8.343204e-16
## PC14            -5.152261e-16 -6.968134e-16  6.017865e-16  6.380302e-16
## PC15             1.300188e-15 -3.905049e-16 -7.047022e-16  2.721442e-16
##                          PC12          PC13          PC14          PC15
## cd2.robbbPerPop  1.064431e-01 -6.538260e-02 -1.767901e-02 -3.725020e-02
## PC1              8.209630e-16  9.745620e-16 -4.880933e-16 -5.667529e-16
## PC2             -1.878157e-16  9.635939e-16 -5.111330e-17 -1.107999e-15
## PC3             -7.468837e-17 -1.088160e-16  7.887764e-16 -3.737522e-17
## PC4             -2.063056e-16 -6.443009e-16 -8.299632e-17  4.178229e-16
## PC5              9.629591e-16  7.084131e-16 -7.297101e-16 -2.097534e-16
## PC6             -1.197839e-15  8.761966e-16  6.628259e-16  7.065229e-16
## PC7             -1.661184e-16 -2.098495e-16  4.483261e-16 -3.129904e-16
## PC8              2.080204e-16 -3.333472e-16 -5.152261e-16  1.300188e-15
## PC9              3.276420e-16 -1.455608e-16 -6.968134e-16 -3.905049e-16
## PC10             8.668513e-16  5.820771e-19  6.017865e-16 -7.047022e-16
## PC11            -5.201015e-16  8.343204e-16  6.380302e-16  2.721442e-16
## PC12             1.000000e+00  1.753261e-15 -1.442269e-15  4.167717e-16
## PC13             1.753261e-15  1.000000e+00 -6.837209e-16 -1.026370e-15
## PC14            -1.442269e-15 -6.837209e-16  1.000000e+00 -7.171544e-16
## PC15             4.167717e-16 -1.026370e-15 -7.171544e-16  1.000000e+00
#make a correlation heatmap
library(ggplot2)

library(reshape2)  
melted_corr <- melt(corr_df)
ggplot(data = melted_corr, aes(x=Var1, y=Var2, fill=value)) + geom_tile() + scale_fill_gradient(low="yellow", high="blue")