# read the human data: a comma-separated file whose first row holds the
# column names
human <- read.table(
  "http://s3.amazonaws.com/assets.datacamp.com/production/course_2218/datasets/human1.txt",
  sep = ",",
  # spelled out as TRUE: `T` is an ordinary variable that can be reassigned
  header = TRUE
)
# look at the (column) names of human
names(human)
## [1] "HDI.Rank" "Country" "HDI" "Life.Exp"
## [5] "Edu.Exp" "Edu.Mean" "GNI" "GNI.Minus.Rank"
## [9] "GII.Rank" "GII" "Mat.Mor" "Ado.Birth"
## [13] "Parli.F" "Edu2.F" "Edu2.M" "Labo.F"
## [17] "Labo.M" "Edu2.FM" "Labo.FM"
# inspect the structure of 'human': dimensions, column types, first values
str(object = human)
## 'data.frame': 195 obs. of 19 variables:
## $ HDI.Rank : int 1 2 3 4 5 6 6 8 9 9 ...
## $ Country : Factor w/ 195 levels "Afghanistan",..: 129 10 169 48 124 67 84 186 34 125 ...
## $ HDI : num 0.944 0.935 0.93 0.923 0.922 0.916 0.916 0.915 0.913 0.913 ...
## $ Life.Exp : num 81.6 82.4 83 80.2 81.6 80.9 80.9 79.1 82 81.8 ...
## $ Edu.Exp : num 17.5 20.2 15.8 18.7 17.9 16.5 18.6 16.5 15.9 19.2 ...
## $ Edu.Mean : num 12.6 13 12.8 12.7 11.9 13.1 12.2 12.9 13 12.5 ...
## $ GNI : Factor w/ 194 levels "1,096","1,123",..: 166 135 156 139 140 137 127 154 134 117 ...
## $ GNI.Minus.Rank: int 5 17 6 11 9 11 16 3 11 23 ...
## $ GII.Rank : int 1 2 3 4 5 6 6 8 9 9 ...
## $ GII : num 0.067 0.11 0.028 0.048 0.062 0.041 0.113 0.28 0.129 0.157 ...
## $ Mat.Mor : int 4 6 6 5 6 7 9 28 11 8 ...
## $ Ado.Birth : num 7.8 12.1 1.9 5.1 6.2 3.8 8.2 31 14.5 25.3 ...
## $ Parli.F : num 39.6 30.5 28.5 38 36.9 36.9 19.9 19.4 28.2 31.4 ...
## $ Edu2.F : num 97.4 94.3 95 95.5 87.7 96.3 80.5 95.1 100 95 ...
## $ Edu2.M : num 96.7 94.6 96.6 96.6 90.5 97 78.6 94.8 100 95.3 ...
## $ Labo.F : num 61.2 58.8 61.8 58.7 58.5 53.6 53.1 56.3 61.6 62 ...
## $ Labo.M : num 68.7 71.8 74.9 66.4 70.6 66.4 68.1 68.9 71 73.8 ...
## $ Edu2.FM : num 1.007 0.997 0.983 0.989 0.969 ...
## $ Labo.FM : num 0.891 0.819 0.825 0.884 0.829 ...
# access the stringr and dplyr packages
library("stringr")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("GGally")
## Warning: package 'GGally' was built under R version 3.6.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library("corrplot")
## Warning: package 'corrplot' was built under R version 3.6.3
## corrplot 0.84 loaded
# inspect how the GNI column of 'human' is stored
str(human[["GNI"]])
## Factor w/ 194 levels "1,096","1,123",..: 166 135 156 139 140 137 127 154 134 117 ...
# remove the thousands-separator commas from GNI and store it as numeric
# - str_replace_all() strips every comma; str_replace() removes only the
#   first one, so a value such as "1,234,567" would be converted to NA
# - as.numeric() receives the cleaned strings as its only argument
human$GNI <- as.numeric(
  str_replace_all(human$GNI, pattern = ",", replacement = "")
)
# names of the variables retained for the analysis
keep <- c(
  "Country", "Edu2.FM", "Labo.FM", "Life.Exp", "Edu.Exp",
  "GNI", "Mat.Mor", "Ado.Birth", "Parli.F"
)
# restrict 'human' to those columns
human <- human %>% select(one_of(keep))
# row-wise completeness indicator: TRUE where a row has no missing value
complete.cases(human)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [13] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [25] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE
## [37] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [49] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE
## [61] TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [73] TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## [85] TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
## [97] TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [109] FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [121] TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [133] FALSE TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE TRUE TRUE FALSE
## [145] TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE FALSE TRUE TRUE
## [157] FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [169] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
## [181] TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [193] TRUE TRUE TRUE
# show the data without its first column (Country), with the completeness
# indicator appended as the last column 'comp'
cbind(human[-1], comp = complete.cases(human))
## Edu2.FM Labo.FM Life.Exp Edu.Exp GNI Mat.Mor Ado.Birth Parli.F comp
## 1 1.0072389 0.8908297 81.6 17.5 64992 4 7.8 39.6 TRUE
## 2 0.9968288 0.8189415 82.4 20.2 42261 6 12.1 30.5 TRUE
## 3 0.9834369 0.8251001 83.0 15.8 56431 6 1.9 28.5 TRUE
## 4 0.9886128 0.8840361 80.2 18.7 44025 5 5.1 38.0 TRUE
## 5 0.9690608 0.8286119 81.6 17.9 45435 6 6.2 36.9 TRUE
## 6 0.9927835 0.8072289 80.9 16.5 43919 7 3.8 36.9 TRUE
## 7 1.0241730 0.7797357 80.9 18.6 39568 9 8.2 19.9 TRUE
## 8 1.0031646 0.8171263 79.1 16.5 52947 28 31.0 19.4 TRUE
## 9 1.0000000 0.8676056 82.0 15.9 42155 11 14.5 28.2 TRUE
## 10 0.9968520 0.8401084 81.8 19.2 32689 8 25.3 31.4 TRUE
## 11 0.9148148 0.7616580 83.0 15.4 76628 6 6.0 25.3 TRUE
## 12 0.9116162 0.7566372 84.0 15.6 53959 NA 3.3 NA FALSE
## 13 NA NA 80.0 15.0 79851 NA NA 20.0 FALSE
## 14 0.9908362 0.8880707 82.2 15.8 45636 4 6.5 43.6 TRUE
## 15 0.9989990 0.8107715 80.7 16.2 39267 8 25.8 23.5 TRUE
## 16 0.9934498 0.9108527 82.6 19.0 35182 4 11.5 41.3 TRUE
## 17 0.8641975 0.6948682 81.9 16.9 33890 27 2.2 16.3 TRUE
## 18 0.9667812 0.8379161 82.4 16.0 30676 2 7.8 22.5 TRUE
## 19 1.0000000 0.7848297 81.7 13.9 58711 11 8.3 28.3 TRUE
## 20 1.0139860 0.6931818 83.5 15.3 36927 6 5.4 11.6 TRUE
## 21 0.9348613 0.8010118 80.8 16.3 41187 6 6.7 42.4 TRUE
## 22 0.9375000 0.8230519 82.2 16.0 38056 12 5.7 25.7 TRUE
## 23 1.0000000 0.8064993 81.4 15.7 43869 4 4.1 30.3 TRUE
## 24 1.0000000 0.8703125 80.8 17.1 38695 4 9.2 42.5 TRUE
## 25 0.9775510 0.8275316 80.4 16.8 27852 7 0.6 27.7 TRUE
## 26 0.9138167 0.7978723 82.6 17.3 32045 4 10.6 38.0 TRUE
## 27 0.8844720 0.6655462 83.1 16.0 33030 4 4.0 30.1 TRUE
## 28 1.0020060 0.7481698 78.6 16.4 26660 5 4.9 18.9 TRUE
## 29 0.8880597 0.7072000 80.9 17.6 24524 5 11.9 21.0 TRUE
## 30 1.0000000 0.8156749 76.8 16.5 25214 11 16.8 19.8 TRUE
## 31 0.9424779 0.6985392 78.8 14.5 72570 27 23.0 NA FALSE
## 32 0.9302326 0.7876231 80.2 14.0 28633 10 5.5 12.5 TRUE
## 33 1.1305085 0.5319372 78.2 13.8 123124 6 9.5 0.0 TRUE
## 34 1.0040568 NA 81.3 13.5 43978 NA NA 50.0 FALSE
## 35 0.9959799 0.7448980 76.3 15.1 25845 7 15.9 18.7 TRUE
## 36 0.9286550 0.7534669 77.4 15.5 23177 3 12.2 22.1 TRUE
## 37 0.9448568 0.8291233 73.3 16.4 24500 11 10.6 23.4 TRUE
## 38 0.8772379 0.5716440 80.6 14.4 27930 9 18.2 13.0 TRUE
## 39 0.8605974 0.2579821 74.3 16.3 52821 16 10.2 19.9 TRUE
## 40 0.9774306 0.6333333 76.3 17.9 22050 69 54.4 36.8 TRUE
## 41 1.1944444 0.5054348 77.0 13.3 60868 8 27.6 17.5 TRUE
## 42 0.9594241 0.6577540 81.7 15.2 21290 22 55.3 15.8 TRUE
## 43 0.9896266 0.8293051 80.9 16.3 25757 8 12.6 31.3 TRUE
## 44 0.9918946 0.7466667 75.2 15.4 22916 14 12.1 10.1 TRUE
## 45 1.1031128 0.4510932 76.6 14.4 38599 22 13.8 15.0 TRUE
## 46 0.9989899 0.8121302 74.2 15.2 22281 13 13.5 18.0 TRUE
## 47 0.9081197 0.7654110 77.3 14.8 19409 13 12.7 25.8 TRUE
## 48 0.9875666 0.5246691 74.4 14.7 83961 14 14.5 1.5 TRUE
## 49 0.8891235 0.7504363 76.2 15.2 14558 7 15.2 17.3 TRUE
## 50 0.9436009 0.7939778 71.3 15.7 16676 1 20.6 30.1 TRUE
## 51 0.9686486 0.7963738 70.1 14.7 22352 24 25.7 14.5 TRUE
## 52 0.8266200 0.3510896 76.8 13.6 34858 11 10.6 9.6 TRUE
## 53 0.9358696 0.7503852 74.7 14.2 18108 33 31.0 12.0 TRUE
## 54 1.0815109 0.7239583 77.2 15.5 19283 14 58.3 11.5 TRUE
## 55 1.0410959 0.8738966 75.4 12.6 21336 37 28.5 16.7 TRUE
## 56 0.9645749 0.8690629 69.4 15.0 20867 26 29.9 20.1 TRUE
## 57 1.0205245 0.8603133 75.6 15.4 12488 52 48.4 19.6 TRUE
## 58 NA NA 76.1 14.0 20070 NA 49.3 25.7 FALSE
## 59 0.9717868 0.8118644 74.2 14.4 15596 5 35.9 20.4 TRUE
## 60 NA NA 72.7 13.7 13496 NA NA 10.3 FALSE
## 61 1.0821643 0.5990220 77.6 13.3 18192 85 78.5 19.3 TRUE
## 62 0.9130435 0.5880795 74.7 12.7 22762 29 5.7 14.2 TRUE
## 63 0.8517241 0.5876011 74.4 15.6 17470 73 30.9 11.6 TRUE
## 64 1.0045045 NA 73.1 13.4 23300 NA 56.3 43.8 FALSE
## 65 0.9802956 0.7019868 70.4 12.3 26090 84 34.8 24.7 TRUE
## 66 0.7934783 0.7307061 74.9 14.4 12190 16 16.9 34.0 TRUE
## 67 0.9428934 0.6200000 79.4 13.8 7301 80 43.1 48.9 TRUE
## 68 0.9566787 0.3286319 79.3 13.8 16509 16 12.0 3.1 TRUE
## 69 1.0039604 0.5898734 79.4 13.9 13413 38 60.8 33.3 TRUE
## 70 0.9201183 0.2255435 75.4 15.1 15440 23 31.6 3.1 TRUE
## 71 1.1141732 0.6452020 74.2 14.2 16159 110 83.2 17.0 TRUE
## 72 0.6500000 0.4152542 75.3 14.5 18677 20 30.9 14.4 TRUE
## 73 0.9515707 0.4600262 74.9 13.7 9779 29 16.9 5.8 TRUE
## 74 0.9191419 0.5644556 76.8 13.1 16056 49 63.4 37.1 TRUE
## 75 1.0419847 0.7351485 74.5 15.2 15175 69 70.8 9.6 TRUE
## 76 0.9676375 0.7523302 74.9 13.8 7164 41 46.8 11.3 TRUE
## 77 NA NA 73.8 12.9 20805 NA NA 6.7 FALSE
## 78 0.9620123 0.9037356 70.8 11.9 16428 26 40.0 15.6 TRUE
## 79 NA NA 73.4 15.8 10939 23 35.4 25.0 FALSE
## 80 0.8853503 0.2342342 74.0 13.5 11365 50 26.5 11.6 TRUE
## 81 0.7230216 0.6385185 75.4 13.4 11780 7 18.3 33.3 TRUE
## 82 0.9562044 0.7952167 71.0 15.1 8178 23 25.7 11.8 TRUE
## 83 0.8612903 0.2105263 74.8 14.0 13054 89 10.0 25.7 TRUE
## 84 0.8517398 0.8080569 74.6 13.1 11015 89 50.7 22.3 TRUE
## 85 0.9306030 0.6854962 77.8 11.8 9943 21 15.3 20.7 TRUE
## 86 0.9894737 0.7465565 74.7 12.3 8124 29 27.1 10.7 TRUE
## 87 0.6432665 0.5951134 76.5 13.6 9638 8 15.1 19.3 TRUE
## 88 1.0177665 0.6614268 75.9 14.2 10605 87 77.0 41.6 TRUE
## 89 NA 0.8228346 75.1 12.6 9765 34 56.3 20.7 FALSE
## 90 0.8164117 0.8160920 75.8 13.1 12547 32 8.6 23.6 TRUE
## 91 0.9953488 0.5208333 70.0 15.7 7493 59 42.8 14.0 TRUE
## 92 1.0142687 0.8167388 69.4 14.6 10729 68 18.7 14.9 TRUE
## 93 0.8750000 0.7967782 74.4 13.5 13323 26 41.0 6.1 TRUE
## 94 1.2801724 NA 77.8 12.7 9994 NA NA 21.9 FALSE
## 95 1.3245823 0.3926702 71.6 14.0 14911 15 2.5 16.0 TRUE
## 96 0.7114967 0.3540197 74.8 14.6 10404 46 4.6 31.3 TRUE
## 97 1.0233813 0.7001255 74.0 13.5 12040 83 68.5 20.9 TRUE
## 98 NA 0.7141026 72.9 13.4 9937 45 54.5 13.0 FALSE
## 99 1.0541311 0.7912553 75.7 12.4 7415 80 70.1 16.7 TRUE
## 100 0.9909400 0.7171582 72.8 14.7 5069 120 18.1 0.0 TRUE
## 101 1.0079156 0.5978129 70.0 13.6 7614 45 71.4 13.3 TRUE
## 102 1.0470810 0.6526718 73.5 13.1 11883 100 99.6 19.1 TRUE
## 103 0.9469214 0.5886628 71.1 12.7 15617 130 35.2 11.8 TRUE
## 104 0.8348624 0.7251613 76.8 13.0 12328 31 4.2 5.9 TRUE
## 105 1.0716667 0.4023973 73.4 12.9 5327 58 28.3 6.1 TRUE
## 106 0.9448010 0.8811275 64.5 12.5 16646 170 44.2 9.5 TRUE
## 107 0.9689441 0.8506787 71.6 11.9 5223 21 29.3 20.8 TRUE
## 108 0.7244224 0.3168449 71.1 13.5 10512 45 43.0 2.2 TRUE
## 109 NA 0.6098830 65.6 10.8 13066 61 18.0 25.8 FALSE
## 110 1.4930748 0.8593272 64.4 12.5 16367 240 103.0 16.2 TRUE
## 111 0.8109756 0.6104513 68.9 13.0 9788 190 48.3 17.1 TRUE
## 112 0.8558140 0.6568396 72.9 11.9 7643 110 67.0 16.8 TRUE
## 113 0.9074074 0.2319277 72.9 13.0 4699 NA 45.8 NA FALSE
## 114 NA 0.6362434 68.4 11.5 5567 36 38.8 16.4 FALSE
## 115 1.0345369 0.6411543 68.2 11.3 7915 120 46.8 27.1 TRUE
## 116 0.8440367 0.6050633 73.0 12.3 7349 69 76.0 27.4 TRUE
## 117 0.9578393 0.7355372 57.4 13.6 12122 140 50.9 40.7 TRUE
## 118 0.8342697 0.8880779 75.8 11.9 5092 49 29.0 24.3 TRUE
## 119 0.8054146 0.7935723 68.3 13.2 5760 200 71.9 51.8 TRUE
## 120 0.9762397 0.7044025 70.6 12.5 3044 75 29.3 23.3 TRUE
## 121 0.5537849 0.2134670 69.4 10.1 14003 67 68.7 26.5 TRUE
## 122 NA 0.6152927 73.3 13.5 6094 53 70.6 20.8 FALSE
## 123 NA NA 69.1 11.7 3432 96 18.6 0.0 FALSE
## 124 1.2615063 0.5291925 66.4 10.3 6522 250 88.5 31.3 TRUE
## 125 1.0287206 0.5902864 74.9 11.5 4457 100 100.8 39.1 TRUE
## 126 0.6854305 0.3496042 74.0 11.6 6850 120 35.8 11.0 TRUE
## 127 0.9680233 0.8587127 64.8 11.3 9418 130 54.9 37.7 TRUE
## 128 0.9439655 0.5589569 71.8 10.7 6929 140 97.2 13.3 TRUE
## 129 1.0427632 0.7639429 69.4 11.2 2517 44 42.8 15.2 TRUE
## 130 0.4770318 0.3379224 68.0 11.7 5497 190 32.8 12.2 TRUE
## 131 1.0852713 0.5162847 73.1 11.1 3938 120 84.0 25.8 TRUE
## 132 0.9855072 0.8639896 69.5 12.6 7176 120 40.9 8.3 TRUE
## 133 NA 0.4842520 68.2 11.7 5363 270 52.2 38.5 FALSE
## 134 0.7283951 0.1856946 69.6 12.3 2728 49 41.6 12.4 TRUE
## 135 NA 0.7687500 71.9 10.6 2803 86 44.8 0.0 FALSE
## 136 0.8446809 0.9383562 62.3 11.1 6012 410 126.7 11.5 TRUE
## 137 NA NA 66.0 12.3 2434 130 16.6 8.7 FALSE
## 138 NA 0.8752711 57.6 9.0 21056 290 112.6 19.7 FALSE
## 139 0.5863636 0.8539720 60.1 13.5 3734 280 125.4 12.7 TRUE
## 140 0.6986090 0.9425770 61.4 11.5 3852 380 58.4 10.9 TRUE
## 141 0.6189189 0.9646018 66.2 10.6 4680 NA 65.0 25.0 FALSE
## 142 0.8256659 0.6825208 71.6 10.0 3191 170 80.6 20.0 TRUE
## 143 0.4323144 0.9109827 68.4 10.9 2949 170 44.3 19.0 TRUE
## 144 NA 0.5822622 66.5 11.3 2918 210 65.1 18.2 FALSE
## 145 0.8057325 0.8591160 61.6 11.0 2762 400 93.6 20.8 TRUE
## 146 0.4633508 0.9173364 69.6 12.4 2311 190 73.7 29.5 TRUE
## 147 0.4186551 0.2967431 66.2 7.8 4866 170 27.3 19.7 TRUE
## 148 1.4967320 0.9137303 65.9 8.6 4608 200 12.1 4.7 TRUE
## 149 NA 0.8231469 52.3 11.4 6822 460 170.2 36.8 FALSE
## 150 0.8423077 0.6131285 49.0 11.3 5542 310 72.0 14.7 TRUE
## 151 0.5894737 0.9767184 65.0 9.2 2411 410 122.7 36.0 TRUE
## 152 NA 0.7566719 52.8 9.0 5341 560 119.6 6.6 FALSE
## 153 0.6103152 0.8307292 55.5 10.4 2803 590 115.8 27.1 TRUE
## 154 NA 0.9569061 65.1 10.3 1328 440 122.8 20.5 FALSE
## 155 0.7854839 0.9275362 57.5 10.9 1615 470 60.3 35.1 TRUE
## 156 0.3971292 0.3628319 63.1 8.5 3560 320 73.3 22.2 TRUE
## 157 NA 0.6759494 67.9 9.2 1540 130 64.9 2.0 FALSE
## 158 0.5241379 0.9527027 62.6 9.9 2463 220 62.1 2.7 TRUE
## 159 NA 0.4394507 63.3 11.5 1456 350 51.1 3.0 FALSE
## 160 0.3220974 0.3518006 63.8 9.2 3519 270 47.0 0.7 TRUE
## 161 1.1526316 0.8027211 49.8 11.1 3306 490 89.4 26.8 TRUE
## 162 0.3995037 0.9913899 59.7 12.2 1228 450 91.5 17.6 TRUE
## 163 0.6363636 0.8577465 62.8 8.7 1669 380 42.0 3.5 TRUE
## 164 0.9090909 1.0128957 64.2 10.3 1458 320 33.6 57.5 TRUE
## 165 0.6835821 0.9570707 58.5 9.8 1613 360 126.6 35.0 TRUE
## 166 0.4185185 0.8633461 59.6 11.1 1767 340 90.2 8.4 TRUE
## 167 0.6648352 0.4118421 63.5 7.0 3809 360 84.0 23.8 TRUE
## 168 NA 0.5361891 62.0 6.4 3276 230 18.6 12.7 FALSE
## 169 NA NA 55.7 7.6 2332 730 75.3 24.3 FALSE
## 170 0.4675325 0.7500000 66.5 7.9 2188 320 94.4 42.7 TRUE
## 171 0.1979866 0.1987421 60.4 9.3 1885 400 86.8 27.6 TRUE
## 172 0.4651163 0.6437346 51.5 8.9 3171 720 130.3 9.2 TRUE
## 173 0.5138889 1.0380368 62.8 10.8 747 510 144.8 16.7 TRUE
## 174 0.4285714 0.8756999 64.1 8.5 1428 420 78.4 25.5 TRUE
## 175 0.5523810 0.8709288 60.2 8.8 1507 430 115.8 9.4 TRUE
## 176 0.3950617 0.9658470 58.7 9.8 680 730 135.3 8.2 TRUE
## 177 0.3918575 0.8981481 60.9 9.5 805 640 117.4 10.7 TRUE
## 178 NA 0.8687898 55.2 9.0 1362 560 99.3 13.7 FALSE
## 179 0.5099338 0.6240786 58.0 8.4 1583 550 175.6 9.5 TRUE
## 180 0.2258065 1.0326087 55.1 9.3 1123 480 137.8 39.6 TRUE
## 181 0.4608295 0.9521739 50.9 8.6 1780 1100 100.7 12.4 TRUE
## 182 NA 0.8378033 58.8 8.7 1096 650 131.0 21.9 FALSE
## 183 0.2812500 0.8566667 58.7 7.8 1591 400 115.4 13.3 TRUE
## 184 0.6385542 1.0158537 56.7 10.1 758 740 30.3 34.9 TRUE
## 185 0.1717172 0.8080808 51.6 7.4 2085 980 152.0 14.9 TRUE
## 186 NA 0.8908686 63.7 4.1 1130 380 65.3 22.0 FALSE
## 187 0.3782772 0.8531140 50.7 7.2 581 880 98.3 12.5 TRUE
## 188 0.3076923 0.4459309 61.4 5.4 908 630 204.8 13.3 TRUE
## 189 0.7289916 0.3081009 70.6 12.0 15722 155 45.4 14.0 TRUE
## 190 0.8250377 0.7884131 74.0 12.7 11449 72 21.2 18.7 TRUE
## 191 0.8784119 0.6514286 72.3 13.6 12791 28 30.8 19.0 TRUE
## 192 0.9836957 0.6729323 75.0 14.0 14242 85 68.3 27.0 TRUE
## 193 0.5329670 0.3711083 68.4 11.2 5605 183 38.7 17.5 TRUE
## 194 0.7015873 0.8537859 58.5 9.6 3363 506 109.7 22.5 TRUE
## 195 0.8333333 0.6558018 71.5 12.2 14301 210 47.4 21.8 TRUE
# keep only the rows that have no missing values
human <- human %>% filter(complete.cases(human))
# 'human' is now free of NA values
# show the final 10 observations
tail(human, n = 10)
## Country Edu2.FM Labo.FM Life.Exp Edu.Exp GNI
## 153 Chad 0.1717172 0.8080808 51.6 7.4 2085
## 154 Central African Republic 0.3782772 0.8531140 50.7 7.2 581
## 155 Niger 0.3076923 0.4459309 61.4 5.4 908
## 156 Arab States 0.7289916 0.3081009 70.6 12.0 15722
## 157 East Asia and the Pacific 0.8250377 0.7884131 74.0 12.7 11449
## 158 Europe and Central Asia 0.8784119 0.6514286 72.3 13.6 12791
## 159 Latin America and the Caribbean 0.9836957 0.6729323 75.0 14.0 14242
## 160 South Asia 0.5329670 0.3711083 68.4 11.2 5605
## 161 Sub-Saharan Africa 0.7015873 0.8537859 58.5 9.6 3363
## 162 World 0.8333333 0.6558018 71.5 12.2 14301
## Mat.Mor Ado.Birth Parli.F
## 153 980 152.0 14.9
## 154 880 98.3 12.5
## 155 630 204.8 13.3
## 156 155 45.4 14.0
## 157 72 21.2 18.7
## 158 28 30.8 19.0
## 159 85 68.3 27.0
## 160 183 38.7 17.5
## 161 506 109.7 22.5
## 162 210 47.4 21.8
# index of the last row we want to keep: the final 7 rows are regional
# aggregates ("Arab States" ... "World") rather than countries
last <- nrow(human) - 7
# choose everything until the last 7 observations; seq_len() gives an empty
# index when last is 0, whereas 1:last would count down (1, 0) and silently
# keep row 1
human <- human[seq_len(last), ]
# add countries as rownames
rownames(human) <- human$Country
# remove the Country variable (it now lives in the row names)
human <- select(human, -Country)
head(human)
## Edu2.FM Labo.FM Life.Exp Edu.Exp GNI Mat.Mor Ado.Birth
## Norway 1.0072389 0.8908297 81.6 17.5 64992 4 7.8
## Australia 0.9968288 0.8189415 82.4 20.2 42261 6 12.1
## Switzerland 0.9834369 0.8251001 83.0 15.8 56431 6 1.9
## Denmark 0.9886128 0.8840361 80.2 18.7 44025 5 5.1
## Netherlands 0.9690608 0.8286119 81.6 17.9 45435 6 6.2
## Germany 0.9927835 0.8072289 80.9 16.5 43919 7 3.8
## Parli.F
## Norway 39.6
## Australia 30.5
## Switzerland 28.5
## Denmark 38.0
## Netherlands 36.9
## Germany 36.9
# scatter-plot matrix of all variables in 'human'
ggpairs(data = human)
# print the pairwise correlation matrix of the variables
human %>% cor()
## Edu2.FM Labo.FM Life.Exp Edu.Exp GNI
## Edu2.FM 1.000000000 0.009564039 0.5760299 0.59325156 0.43030485
## Labo.FM 0.009564039 1.000000000 -0.1400125 0.04732183 -0.02173971
## Life.Exp 0.576029853 -0.140012504 1.0000000 0.78943917 0.62666411
## Edu.Exp 0.593251562 0.047321827 0.7894392 1.00000000 0.62433940
## GNI 0.430304846 -0.021739705 0.6266641 0.62433940 1.00000000
## Mat.Mor -0.660931770 0.240461075 -0.8571684 -0.73570257 -0.49516234
## Ado.Birth -0.529418415 0.120158862 -0.7291774 -0.70356489 -0.55656208
## Parli.F 0.078635285 0.250232608 0.1700863 0.20608156 0.08920818
## Mat.Mor Ado.Birth Parli.F
## Edu2.FM -0.6609318 -0.5294184 0.07863528
## Labo.FM 0.2404611 0.1201589 0.25023261
## Life.Exp -0.8571684 -0.7291774 0.17008631
## Edu.Exp -0.7357026 -0.7035649 0.20608156
## GNI -0.4951623 -0.5565621 0.08920818
## Mat.Mor 1.0000000 0.7586615 -0.08944000
## Ado.Birth 0.7586615 1.0000000 -0.07087810
## Parli.F -0.0894400 -0.0708781 1.00000000
# visualize the correlation matrix with corrplot
corrplot(cor(human))
The correlation plot shows the correlation of each variable with itself and with every other variable. For example, Edu.Exp is strongly positively correlated with Life.Exp (about 0.79), whereas Life.Exp is only weakly correlated with Labo.FM (about -0.14).
# standardize each variable: centre on its mean, divide by its std. deviation
human_std <- scale(human, center = TRUE, scale = TRUE)
# human_std
# summaries of the standardized variables
summary(human_std)
## Edu2.FM Labo.FM Life.Exp Edu.Exp
## Min. :-2.8189 Min. :-2.6247 Min. :-2.7188 Min. :-2.7378
## 1st Qu.:-0.5233 1st Qu.:-0.5484 1st Qu.:-0.6425 1st Qu.:-0.6782
## Median : 0.3503 Median : 0.2316 Median : 0.3056 Median : 0.1140
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.5958 3rd Qu.: 0.7350 3rd Qu.: 0.6717 3rd Qu.: 0.7126
## Max. : 2.6646 Max. : 1.6632 Max. : 1.4218 Max. : 2.4730
## GNI Mat.Mor Ado.Birth Parli.F
## Min. :-0.9193 Min. :-0.6992 Min. :-1.1325 Min. :-1.8203
## 1st Qu.:-0.7243 1st Qu.:-0.6496 1st Qu.:-0.8394 1st Qu.:-0.7409
## Median :-0.3013 Median :-0.4726 Median :-0.3298 Median :-0.1403
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.3712 3rd Qu.: 0.1932 3rd Qu.: 0.6030 3rd Qu.: 0.6127
## Max. : 5.6890 Max. : 4.4899 Max. : 3.8344 Max. : 3.1850
# principal component analysis of the standardized data (SVD method)
pca_human <- prcomp(x = human_std)
# biplot of the first two principal components with the original variables:
# observations in grey, variable arrows in pink
biplot(
  pca_human,
  choices = 1:2,
  col = c("grey40", "deeppink2"),
  cex = c(0.8, 1)
)
The biplot lets us interpret these correlations. The pink arrows visualize the connections between the original features and the PCs. For example, the Labo.FM arrow points towards positive values of PC1, and the Edu.Exp arrow points towards positive values of PC2.
# summarise the PCA (standard deviation and variance share per component)
# and display the summary
s <- summary(pca_human)
print(s)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.0708 1.1397 0.87505 0.77886 0.66196 0.53631 0.45900
## Proportion of Variance 0.5361 0.1624 0.09571 0.07583 0.05477 0.03595 0.02634
## Cumulative Proportion 0.5361 0.6984 0.79413 0.86996 0.92473 0.96069 0.98702
## PC8
## Standard deviation 0.32224
## Proportion of Variance 0.01298
## Cumulative Proportion 1.00000
# share of variance captured by each PC, as a percentage rounded to 1 decimal
pca_pr <- round(100 * s$importance["Proportion of Variance", ], digits = 1)
# display the percentages of variance
pca_pr
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## 53.6 16.2 9.6 7.6 5.5 3.6 2.6 1.3
# axis labels of the form "PC1 (53.6%)": component name plus variance share
pc_lab <- sprintf("%s (%s%%)", names(pca_pr), pca_pr)
# biplot with the variance shares shown on the axes
biplot(
  pca_human,
  xlab = pc_lab[1],
  ylab = pc_lab[2],
  col = c("grey40", "deeppink2"),
  cex = c(0.8, 1)
)
# same biplot again, differing only in slightly larger observation labels
biplot(
  pca_human,
  xlab = pc_lab[1],
  ylab = pc_lab[2],
  col = c("grey40", "deeppink2"),
  cex = c(0.9, 1)
)
PC1 explains 53.6% of the total variance, making it a fairly good summary measure, whereas PC2 explains 16.2% of the variance. We can also see that high values of PC1 are associated with positive values of almost all the variables; PC1 has a standard deviation of approximately 2.07.