Is there a group of factors that influence the amount of financial aid given out?
Increases financial stability and reputation.
# Frequency of institutions by program length (2-year vs 4-year).
table(college[["level"]])
##
## 2-year 4-year
## 1459 2339
# Frequency of institutions by type of control (private/public).
table(college[["control"]])
##
## Private for-profit Private not-for-profit Public
## 992 1248 1558
# Columns retained for the analysis; everything else is dropped.
keeps <- c(
  "level",
  "control",
  "hbcu",
  "flagship",
  "student_count",
  "aid_value",
  "pell_value"
)
college <- college[, keeps]
# Inspect the structure of the reduced data set.
str(college)
## tibble [3,798 × 7] (S3: tbl_df/tbl/data.frame)
## $ level : Factor w/ 2 levels "2-year","4-year": 2 2 2 2 2 2 1 2 2 2 ...
## $ control : Factor w/ 3 levels "Private for-profit",..: 3 3 2 3 3 3 3 3 3 2 ...
## $ hbcu : num [1:3798] 1 0 0 0 1 0 0 0 0 0 ...
## $ flagship : num [1:3798] 0 0 0 0 0 1 0 0 0 0 ...
## $ student_count: num [1:3798] 4051 11502 322 5696 5356 ...
## $ aid_value : num [1:3798] 7142 6088 2540 6647 7256 ...
## $ pell_value : num [1:3798] 71.2 35.1 68.4 32.8 82.7 21.1 65.1 40.1 16.9 21.4 ...
# Count the missing values in aid_value before cleaning.
sum(is.na(college[["aid_value"]]))
## [1] 1
# Keep only the rows where aid_value is present.
college <- college[!is.na(college[["aid_value"]]), ]
# Confirm that no missing aid_value entries remain.
sum(is.na(college[["aid_value"]]))
## [1] 0
# Let's look at min-max scaling, which places the values between 0 and 1.
### Build our own normalizer, which is probably how I would go if given the option. If you need to scale multiple columns, use lapply. You will also see this referred to as a min-max scaler function.
#' Min-max scale a vector to the interval [0, 1]
#'
#' Subtracts the minimum from every element and divides by the range
#' (maximum - minimum), so the smallest value maps to 0 and the largest to 1.
#'
#' @param x A numeric (or logical) vector.
#' @param na.rm Should missing values be ignored when computing the minimum
#'   and maximum? Defaults to `FALSE`, which keeps the earlier behaviour: a
#'   single `NA` in `x` makes every element of the result `NA`. With `TRUE`,
#'   `NA` entries stay `NA` and the remaining values are scaled.
#' @return A numeric vector of the same length as `x`. Empty or all-`NA`
#'   input is returned as-is (as numeric). A constant vector, whose range is
#'   zero, is mapped to 0 instead of the `NaN` produced by dividing 0 by 0.
normalize <- function(x, na.rm = FALSE) {
  if (!(is.numeric(x) || is.logical(x))) {
    stop("`x` must be a numeric vector.", call. = FALSE)
  }

  # Nothing to scale; also avoids the min()/max() warnings on empty input.
  if (length(x) == 0L || all(is.na(x))) {
    return(as.numeric(x))
  }

  limits <- range(x, na.rm = na.rm)
  span <- limits[2] - limits[1]

  # A constant column has zero range; return 0 rather than 0 / 0 = NaN.
  if (isTRUE(span == 0)) {
    scaled <- numeric(length(x))
    scaled[is.na(x)] <- NA_real_
    return(scaled)
  }

  # Numerator shifts the minimum to 0; denominator is the range of x.
  (x - limits[1]) / span
}
# Min-max scale pell_value on its own, then print the result to inspect it.
pellvalue_n <- normalize(college$pell_value)
pellvalue_n
## [1] 0.712 0.351 0.684 0.328 0.827 0.211 0.651 0.401 0.169 0.214 0.612 0.836
## [13] 0.713 0.512 0.489 0.582 0.600 0.555 0.502 0.769 0.637 0.430 0.587 0.461
## [25] 0.528 0.358 0.403 0.479 0.682 0.560 0.575 0.442 0.823 0.503 0.414 0.523
## [37] 0.419 0.628 0.507 0.560 0.645 0.701 0.635 0.149 0.750 0.522 0.574 0.374
## [49] 0.333 0.609 0.743 0.966 0.610 0.476 0.645 0.584 0.247 0.235 0.183 0.279
## [61] 0.660 0.209 0.826 0.726 0.730 0.651 0.843 0.661 0.338 0.580 0.327 0.531
## [73] 0.464 0.835 0.308 0.198 0.288 0.381 0.526 0.779 0.515 0.337 0.373 0.730
## [85] 0.680 0.477 0.822 0.378 0.294 0.473 0.524 0.327 0.490 0.618 0.455 0.645
## [97] 0.256 0.220 0.505 0.328 0.481 0.386 0.314 0.403 0.711 0.462 0.243 0.699
## [109] 0.316 0.462 0.456 0.647 0.641 0.394 0.453 0.382 0.443 0.416 0.603 0.633
## [121] 0.263 0.530 0.179 0.318 0.582 0.502 0.615 0.270 0.370 0.709 0.391 0.672
## [133] 0.728 0.396 0.586 0.574 0.619 0.389 0.527 0.510 0.515 0.322 0.505 0.424
## [145] 0.662 0.220 0.242 0.416 0.706 0.286 0.511 0.369 0.286 0.390 0.494 0.750
## [157] 0.310 0.428 0.589 0.493 0.248 0.478 0.324 0.106 0.258 0.198 0.323 0.584
## [169] 0.577 0.580 0.440 0.389 0.586 0.541 0.406 0.443 0.473 0.645 0.485 0.505
## [181] 0.324 0.430 0.432 0.362 0.564 0.427 0.380 0.446 0.716 0.407 0.334 0.376
## [193] 0.157 0.228 0.706 0.471 0.176 0.255 0.368 0.203 0.519 0.250 0.647 0.325
## [205] 0.258 0.116 0.175 0.399 0.487 0.288 0.485 0.227 0.286 0.243 0.202 0.168
## [217] 0.260 0.265 0.159 0.454 0.578 0.139 0.323 0.273 0.258 0.319 0.315 0.318
## [229] 0.353 0.464 0.321 0.086 0.382 0.576 0.249 0.233 0.410 0.277 0.228 0.323
## [241] 0.131 0.789 0.889 0.759 0.737 0.761 0.462 0.516 0.755 0.562 0.125 0.606
## [253] 0.761 0.724 0.307 0.313 0.394 0.618 0.465 0.411 0.224 0.226 0.201 0.368
## [265] 0.482 0.440 0.348 0.337 0.378 0.371 0.319 0.305 0.328 0.337 0.271 0.599
## [277] 0.192 0.654 0.573 0.363 0.346 0.246 0.406 0.358 0.333 0.473 0.223 0.467
## [289] 0.120 0.253 0.390 0.130 0.176 0.386 0.587 0.364 0.309 0.267 0.690 0.745
## [301] 0.598 0.268 0.614 0.420 0.213 0.161 0.196 0.485 0.353 0.499 0.378 0.439
## [313] 0.401 0.115 0.158 0.281 0.208 0.161 0.465 0.271 0.169 0.526 0.844 0.104
## [325] 0.250 0.387 0.292 0.322 0.265 0.140 0.336 0.411 0.186 0.196 0.396 0.181
## [337] 0.327 0.333 0.424 0.263 0.406 0.883 0.881 0.474 0.197 0.395 0.157 0.708
## [349] 0.752 0.676 0.219 0.139 0.250 0.181 0.132 0.655 0.443 0.408 0.315 0.564
## [361] 0.386 0.227 0.306 0.327 0.252 0.297 0.625 0.380 0.637 0.273 0.491 0.231
## [373] 0.176 0.357 0.808 0.950 0.456 0.417 0.286 0.406 0.191 0.480 0.351 0.293
## [385] 0.154 0.713 0.209 0.355 0.485 0.589 0.415 0.503 0.425 0.411 0.509 0.745
## [397] 0.278 0.322 0.682 0.172 0.414 0.098 0.408 0.522 0.234 0.237 0.175 0.458
## [409] 0.257 0.444 0.393 0.800 0.478 0.473 0.655 0.763 0.186 0.318 0.368 0.328
## [421] 0.402 0.385 0.300 0.473 0.602 0.717 0.446 0.313 0.351 0.486 0.742 0.456
## [433] 0.729 0.831 0.342 0.260 0.363 0.413 0.965 0.635 0.494 0.314 0.581 0.379
## [445] 0.576 0.486 0.334 0.130 0.209 0.543 0.264 0.153 0.539 0.264 0.466 0.374
## [457] 0.395 0.629 0.338 0.430 0.258 0.335 0.365 0.297 0.431 0.486 0.143 0.150
## [469] 0.369 0.456 0.257 0.346 0.116 0.346 0.184 0.284 0.127 0.481 0.444 0.325
## [481] 0.503 0.119 0.315 0.496 0.327 0.156 0.123 0.285 0.492 0.476 0.136 0.133
## [493] 0.449 0.686 0.539 0.648 0.604 0.629 0.518 0.776 0.200 0.366 0.501 0.506
## [505] 0.627 0.543 0.527 0.377 0.423 0.514 0.491 0.700 0.234 0.399 0.841 0.310
## [517] 0.644 0.400 0.387 0.292 0.373 0.398 0.466 0.378 0.359 0.241 0.581 0.314
## [529] 0.898 0.780 0.329 0.306 0.775 0.897 0.321 0.834 0.635 0.428 0.431 0.547
## [541] 0.399 0.655 0.568 0.676 0.294 0.778 0.596 0.729 0.390 0.312 0.559 0.463
## [553] 0.516 0.185 0.738 0.792 0.733 0.563 0.348 0.338 0.442 0.315 0.801 0.348
## [565] 0.469 0.444 0.376 0.415 0.300 0.316 0.454 0.427 0.383 0.659 0.477 0.387
## [577] 0.455 0.413 0.511 0.365 0.418 0.436 0.750 0.792 0.247 0.629 0.566 0.446
## [589] 0.649 0.450 0.692 0.410 0.461 0.449 0.394 0.809 0.561 0.723 0.572 0.434
## [601] 0.669 0.765 0.591 0.711 0.708 0.695 0.790 0.805 0.263 0.625 0.521 0.457
## [613] 0.651 0.513 0.572 0.793 0.578 0.701 0.456 0.708 0.296 0.612 0.718 0.985
## [625] 0.511 0.219 0.515 0.765 0.193 0.477 0.218 0.622 0.375 0.509 0.236 0.539
## [637] 0.674 0.849 0.816 0.542 0.418 0.452 0.534 0.481 0.534 0.647 0.410 0.502
## [649] 0.656 0.700 0.398 0.771 0.509 0.480 0.655 0.260 0.740 0.473 0.726 0.515
## [661] 0.414 0.613 0.452 0.596 0.227 0.739 0.459 0.369 0.476 0.384 0.724 0.293
## [673] 0.453 0.309 0.257 0.240 0.184 0.372 0.268 0.465 0.351 0.376 0.643 0.621
## [685] 0.372 0.724 0.475 0.399 0.362 0.417 0.711 0.488 0.373 0.388 0.344 0.484
## [697] 0.252 0.193 0.391 0.360 0.343 0.527 0.268 0.484 0.687 0.142 0.554 0.495
## [709] 0.447 0.240 0.292 0.516 0.338 0.380 0.452 0.602 0.800 0.333 0.345 0.246
## [721] 0.693 0.392 0.403 0.320 0.390 0.633 0.367 0.450 0.517 0.442 0.491 0.364
## [733] 0.202 0.188 0.299 0.332 0.412 0.317 0.701 0.256 0.354 0.762 0.345 0.504
## [745] 0.247 0.437 0.473 0.332 0.445 0.369 0.276 0.193 0.329 0.322 0.321 0.314
## [757] 0.516 0.600 0.330 0.792 0.299 0.345 0.644 0.582 0.239 0.379 0.472 0.679
## [769] 0.388 0.443 0.268 0.270 0.573 0.417 0.487 0.239 0.369 0.430 0.865 0.142
## [781] 0.482 0.195 0.316 0.352 0.433 0.431 0.257 0.579 0.304 0.412 0.622 0.356
## [793] 0.741 0.527 0.421 0.440 0.365 0.488 0.368 0.428 0.390 0.473 0.804 0.419
## [805] 0.332 0.433 0.356 0.886 0.588 0.341 0.480 0.274 0.403 0.295 0.428 0.210
## [817] 0.218 0.667 0.336 0.346 0.517 0.174 0.406 0.387 0.188 0.258 0.256 0.402
## [829] 0.296 0.344 0.348 0.333 0.346 0.549 0.367 0.366 0.797 0.373 0.624 0.323
## [841] 0.434 0.319 0.337 0.194 0.347 0.344 0.361 0.647 0.639 0.687 0.682 0.698
## [853] 0.817 0.799 0.619 0.358 0.311 0.390 0.779 0.793 0.118 0.126 0.326 0.278
## [865] 0.184 0.422 0.296 0.507 0.203 0.528 0.185 0.238 0.254 0.222 0.240 0.390
## [877] 0.364 0.452 0.490 0.268 0.395 0.283 0.345 0.375 0.340 0.311 0.171 0.550
## [889] 0.440 0.564 0.472 0.536 0.397 0.457 0.210 0.846 0.784 0.821 0.375 0.572
## [901] 0.327 0.338 0.226 0.504 0.397 0.191 0.352 0.244 0.193 0.657 0.394 0.390
## [913] 0.353 0.340 0.593 0.408 0.270 0.322 0.348 0.262 0.258 0.369 0.297 0.570
## [925] 0.419 0.431 0.498 0.252 0.379 0.498 0.431 0.397 0.161 0.210 0.391 0.386
## [937] 0.661 0.710 0.892 0.375 0.617 0.309 0.362 0.278 0.627 0.351 0.633 0.391
## [949] 0.606 0.277 0.604 0.451 0.448 0.334 0.699 0.402 0.276 0.334 0.350 0.204
## [961] 0.386 0.454 0.222 0.225 0.253 0.383 0.294 0.428 0.364 0.431 0.379 0.306
## [973] 0.490 0.373 0.574 0.403 0.306 0.323 0.352 0.225 0.360 0.430 0.363 0.374
## [985] 0.800 0.360 0.584 0.390 0.460 0.226 0.829 0.389 0.501 0.380 0.458 0.156
## [997] 0.759 0.618 0.444 0.428 0.490 0.343 0.487 0.423 0.619 0.673 0.464 0.865
## [1009] 0.595 0.472 0.248 0.495 0.638 0.713 0.300 0.326 0.458 0.563 0.513 0.383
## [1021] 0.352 0.507 0.337 0.656 0.404 0.476 0.445 0.836 0.474 0.814 0.612 0.370
## [1033] 0.473 0.532 0.503 0.261 0.271 0.580 0.393 0.367 0.142 0.707 0.877 0.354
## [1045] 0.727 0.962 0.768 0.638 0.735 0.491 0.409 0.198 0.426 0.300 0.438 0.454
## [1057] 0.222 0.297 0.364 0.320 0.393 0.373 0.511 0.402 0.457 0.370 0.231 0.396
## [1069] 0.714 0.424 0.333 0.628 0.627 0.614 0.340 0.357 0.162 0.574 0.203 0.724
## [1081] 0.301 0.111 0.760 0.138 0.452 0.513 0.108 0.536 0.280 0.519 0.567 0.460
## [1093] 0.322 0.409 0.372 0.288 0.364 0.228 0.530 0.431 0.243 0.488 0.364 0.333
## [1105] 0.441 0.697 0.456 0.263 0.614 0.536 0.426 0.487 0.356 0.312 0.238 0.381
## [1117] 0.539 0.596 0.222 0.350 0.586 0.240 0.743 0.379 0.283 0.342 0.249 0.131
## [1129] 0.584 0.141 0.239 0.270 0.191 0.243 0.551 0.308 0.615 0.227 0.198 0.302
## [1141] 0.362 0.414 0.221 0.152 0.708 0.238 0.371 0.250 0.303 0.149 0.307 0.507
## [1153] 0.503 0.200 0.365 0.230 0.161 0.500 0.529 0.514 0.389 0.155 0.189 0.582
## [1165] 0.442 0.137 0.196 0.149 0.178 0.237 0.426 0.506 0.386 0.360 0.373 0.207
## [1177] 0.255 0.292 0.493 0.170 0.234 0.158 0.575 0.342 0.283 0.646 0.221 0.511
## [1189] 0.217 0.102 0.163 0.516 0.319 0.332 0.262 0.299 0.454 0.251 0.383 0.303
## [1201] 0.301 0.257 0.183 0.186 0.407 0.223 0.345 0.379 0.203 0.393 0.488 0.273
## [1213] 0.156 0.504 0.350 0.451 0.435 0.493 0.142 0.510 0.397 0.656 0.361 0.446
## [1225] 0.358 0.531 0.334 0.751 0.257 0.305 0.226 0.219 0.554 0.380 0.144 0.292
## [1237] 0.113 0.192 0.282 0.250 0.261 0.217 0.347 0.190 0.151 0.271 0.437 0.237
## [1249] 0.306 0.522 0.316 0.336 0.662 0.717 0.507 0.233 0.349 0.531 0.498 0.427
## [1261] 0.370 0.528 0.461 0.287 0.447 0.412 0.469 0.252 0.546 0.634 0.435 0.432
## [1273] 0.363 0.760 0.553 0.204 0.718 0.560 0.186 0.466 0.241 0.583 0.459 0.415
## [1285] 0.439 0.185 0.230 0.357 0.704 0.388 0.157 0.239 0.267 0.430 0.452 0.516
## [1297] 0.374 0.678 0.674 0.433 0.602 0.385 0.413 0.335 0.363 0.203 0.457 0.595
## [1309] 0.129 0.408 0.379 0.434 0.400 0.533 0.449 0.646 0.374 0.654 0.480 0.506
## [1321] 0.397 0.592 0.310 0.489 0.335 0.434 0.376 0.459 0.356 0.406 0.270 0.354
## [1333] 0.123 0.243 0.354 0.347 0.352 0.392 0.696 0.425 0.586 0.376 0.248 0.385
## [1345] 0.449 0.588 0.344 0.494 0.545 0.172 0.414 0.291 0.457 0.433 0.416 0.217
## [1357] 0.239 0.932 0.391 0.572 0.621 0.682 0.243 0.293 0.556 0.331 0.443 0.769
## [1369] 0.616 0.360 0.398 0.357 0.279 0.301 0.615 0.503 0.307 0.392 0.239 0.475
## [1381] 0.307 0.195 0.315 0.149 0.488 0.306 0.202 0.415 0.552 0.610 0.159 0.352
## [1393] 0.584 0.425 0.504 0.269 0.485 0.792 0.511 0.530 0.919 0.763 0.588 0.493
## [1405] 0.606 0.673 0.638 0.596 0.629 0.648 0.511 0.548 0.218 0.726 0.298 0.493
## [1417] 0.690 0.365 0.502 0.341 0.648 0.575 0.571 0.876 0.635 0.464 0.850 0.660
## [1429] 0.744 0.433 0.713 0.498 0.494 0.455 0.544 0.439 0.381 0.443 0.528 0.300
## [1441] 0.387 0.467 0.479 0.926 0.470 0.520 0.466 0.385 0.572 0.382 0.755 0.586
## [1453] 0.471 0.401 0.541 0.352 0.417 0.357 0.338 0.910 0.500 0.478 0.536 0.771
## [1465] 0.216 0.795 0.607 0.325 0.464 0.425 0.216 0.321 0.259 0.304 0.477 0.190
## [1477] 0.356 0.452 0.617 0.321 0.737 0.690 0.174 0.869 0.141 0.577 0.479 0.426
## [1489] 0.156 0.832 0.558 0.593 0.356 0.320 0.625 0.424 0.062 0.350 0.338 0.284
## [1501] 0.276 0.341 0.998 0.478 0.217 0.381 0.623 0.392 0.427 0.974 0.358 0.569
## [1513] 0.421 0.485 0.560 0.438 0.360 0.316 0.391 0.491 0.373 0.845 0.423 0.300
## [1525] 0.378 0.357 0.295 0.334 0.224 0.191 0.339 0.491 0.312 0.344 0.743 0.350
## [1537] 0.332 0.309 0.434 0.686 0.325 0.815 0.653 0.269 0.198 0.288 0.330 0.427
## [1549] 0.420 0.356 0.315 0.323 0.397 0.355 0.494 0.794 0.299 0.649 0.727 0.335
## [1561] 0.288 0.271 0.627 0.375 0.324 0.411 0.335 0.366 0.136 0.244 0.598 0.254
## [1573] 0.448 0.406 0.224 0.239 0.247 0.270 0.373 0.521 0.455 0.443 0.357 0.310
## [1585] 0.318 0.276 0.167 0.358 0.494 0.509 0.326 0.688 0.642 0.280 0.294 0.429
## [1597] 0.401 0.274 0.266 0.545 0.293 0.587 0.253 0.470 0.327 0.427 0.301 0.340
## [1609] 0.501 0.628 0.561 0.409 0.221 0.385 0.266 0.373 0.390 0.287 0.559 0.122
## [1621] 0.390 0.250 0.262 0.433 0.311 0.464 0.520 0.432 0.291 0.377 0.226 0.217
## [1633] 0.356 0.180 0.434 0.396 0.411 0.657 0.545 0.372 0.410 0.291 0.394 0.565
## [1645] 0.192 0.308 0.652 0.295 0.300 0.385 0.526 0.366 0.571 0.434 0.583 0.577
## [1657] 0.298 0.354 0.423 0.554 0.586 0.455 0.376 0.467 0.651 0.295 0.398 0.691
## [1669] 0.806 0.221 0.380 0.398 0.341 0.418 0.190 0.184 0.634 0.216 0.863 0.912
## [1681] 0.666 0.483 0.733 0.748 0.849 0.886 0.752 0.658 0.301 0.508 0.442 0.444
## [1693] 0.778 0.281 0.371 0.115 0.774 0.215 0.409 0.518 0.177 0.164 0.378 0.347
## [1705] 0.257 0.450 0.692 0.726 0.482 0.446 0.507 0.680 0.410 0.545 0.488 0.483
## [1717] 0.523 0.616 0.584 0.381 0.472 0.540 0.356 0.339 0.434 0.302 0.247 0.519
## [1729] 0.967 0.348 0.432 0.238 0.411 0.422 0.204 0.512 0.368 0.137 0.323 0.435
## [1741] 0.509 0.471 0.185 0.226 0.376 0.296 0.654 0.242 0.689 0.194 0.419 0.799
## [1753] 0.494 0.000 0.146 0.706 0.892 0.371 0.313 0.323 0.604 0.175 0.922 0.781
## [1765] 0.833 0.290 0.159 0.348 0.405 0.161 0.260 0.493 0.372 0.565 0.387 0.788
## [1777] 0.486 0.511 0.332 0.774 0.493 0.334 0.552 0.347 0.307 0.755 0.236 0.199
## [1789] 0.405 0.292 0.261 0.372 0.168 0.554 0.246 0.263 0.356 0.434 0.850 0.447
## [1801] 0.460 0.262 0.799 0.210 0.802 0.217 0.166 0.438 0.931 0.313 0.184 0.308
## [1813] 0.464 0.289 0.432 0.433 0.183 0.367 0.236 0.378 0.194 0.344 0.287 0.223
## [1825] 0.146 0.316 0.497 0.321 0.513 0.308 0.515 0.536 0.509 0.559 0.484 0.484
## [1837] 0.271 0.587 0.375 0.269 0.284 0.351 0.268 0.385 0.408 0.459 0.269 0.350
## [1849] 0.227 0.280 0.287 0.391 0.437 0.318 0.471 0.353 0.355 0.231 0.413 0.268
## [1861] 0.762 0.853 0.344 0.319 0.581 0.536 0.272 0.156 0.895 0.076 0.382 0.665
## [1873] 0.220 0.747 0.245 0.190 0.485 0.698 0.350 0.879 0.902 0.444 0.465 0.768
## [1885] 0.181 0.871 0.595 0.399 0.757 0.624 0.259 0.430 0.496 0.530 0.461 0.719
## [1897] 0.699 0.384 0.415 0.421 0.363 0.489 0.369 0.457 0.782 0.430 0.433 0.542
## [1909] 0.433 0.412 0.671 0.507 0.497 0.439 0.839 0.131 0.137 0.431 0.321 0.652
## [1921] 0.703 0.091 0.591 0.587 0.507 0.237 0.466 0.404 0.499 0.637 0.733 0.521
## [1933] 0.126 0.512 0.655 0.531 0.694 0.473 0.490 0.487 0.451 0.834 0.735 0.765
## [1945] 0.527 0.530 0.514 0.352 0.390 0.866 0.452 0.430 0.412 0.501 0.468 0.588
## [1957] 0.319 0.205 0.405 0.443 0.635 0.313 0.225 0.679 0.282 0.323 0.715 0.535
## [1969] 0.439 0.394 0.516 0.518 0.327 0.517 0.671 0.700 0.673 0.706 0.469 0.793
## [1981] 0.745 0.564 0.569 0.451 0.789 0.417 0.594 0.528 0.709 0.423 0.524 0.612
## [1993] 0.460 0.120 0.361 0.331 0.737 0.734 0.530 0.490 0.382 0.572 0.381 0.575
## [2005] 0.237 0.234 0.332 0.278 0.164 0.713 0.236 0.297 0.232 0.209 0.295 0.288
## [2017] 0.225 0.219 0.715 0.722 0.735 0.550 0.264 0.894 0.784 0.830 0.415 0.381
## [2029] 0.970 0.417 0.286 0.478 0.333 0.665 0.388 0.877 0.458 0.343 0.811 0.296
## [2041] 0.200 0.208 0.596 0.758 0.698 0.416 0.487 0.266 0.117 0.439 0.415 0.569
## [2053] 0.512 0.442 0.151 0.434 0.626 0.373 0.416 0.470 0.799 0.119 0.465 0.181
## [2065] 0.443 0.901 0.201 0.473 0.465 0.484 0.429 0.411 0.579 0.788 0.547 0.273
## [2077] 0.675 0.594 0.549 0.538 0.610 0.500 0.524 0.344 0.408 0.103 0.391 0.410
## [2089] 0.395 0.502 0.895 0.899 0.463 0.572 0.399 0.302 0.414 0.523 0.474 0.466
## [2101] 0.155 0.898 0.256 0.318 0.425 0.315 0.559 0.423 0.706 0.832 0.508 0.561
## [2113] 0.349 0.503 0.107 0.651 0.615 0.347 0.614 0.240 0.379 0.458 0.470 0.423
## [2125] 0.421 0.222 0.273 0.544 0.503 0.377 0.234 0.418 0.818 0.218 0.255 0.426
## [2137] 0.076 0.693 0.782 0.662 0.421 0.557 0.440 0.956 0.730 0.912 0.787 0.827
## [2149] 0.818 0.970 0.867 0.600 0.797 0.264 0.599 0.847 0.510 0.617 0.379 0.435
## [2161] 0.928 0.608 0.353 0.423 0.553 0.331 0.458 0.823 0.448 0.264 0.194 0.405
## [2173] 0.357 0.183 0.813 0.512 0.945 0.421 0.537 0.486 0.454 0.350 0.560 0.485
## [2185] 0.189 0.333 0.537 0.717 0.626 0.641 0.493 0.284 0.577 0.418 0.271 0.455
## [2197] 0.291 0.419 0.333 0.425 0.199 0.249 0.382 0.440 0.895 0.472 0.401 0.419
## [2209] 0.464 0.439 0.478 0.652 0.392 0.408 0.577 0.156 0.132 0.565 0.330 0.513
## [2221] 0.473 0.342 0.371 0.446 0.449 0.511 0.577 0.364 0.650 0.588 0.214 0.255
## [2233] 0.373 0.524 0.260 0.460 0.592 0.496 0.362 0.539 0.334 0.258 0.500 0.315
## [2245] 0.442 0.402 0.219 0.172 0.551 0.737 0.385 0.534 0.670 0.585 0.534 0.333
## [2257] 0.832 0.207 0.443 0.354 0.407 0.412 0.262 0.267 0.346 0.489 0.556 0.590
## [2269] 0.585 0.528 0.414 0.337 0.147 0.304 0.690 0.186 0.106 0.235 0.364 0.366
## [2281] 0.308 0.380 0.760 0.356 0.137 0.425 0.564 0.293 0.491 0.770 0.417 0.388
## [2293] 0.882 0.160 0.743 0.289 0.274 0.122 0.548 0.202 0.775 0.761 0.202 0.303
## [2305] 0.409 0.456 0.218 0.848 0.750 0.142 0.318 0.387 0.140 0.335 0.594 0.425
## [2317] 0.150 0.555 0.356 0.561 0.566 0.223 0.362 0.529 0.218 0.448 0.292 0.316
## [2329] 0.383 0.360 0.603 0.107 0.410 0.470 0.228 0.335 0.768 0.147 0.743 0.797
## [2341] 0.643 0.392 0.411 0.326 0.447 0.439 0.353 0.484 0.316 0.204 0.988 0.315
## [2353] 0.289 0.260 0.434 0.303 0.428 0.078 0.387 0.702 0.404 0.645 0.553 0.316
## [2365] 0.337 0.547 0.335 0.465 0.357 0.333 0.365 0.341 0.346 0.340 0.460 0.484
## [2377] 0.456 0.177 0.502 0.389 0.416 0.528 0.314 0.865 0.541 0.785 0.685 0.412
## [2389] 0.392 0.142 0.303 0.318 0.444 0.263 0.607 0.453 0.520 0.306 0.171 0.557
## [2401] 0.340 0.367 0.687 0.832 0.419 0.603 0.519 0.265 0.347 0.733 0.538 0.224
## [2413] 0.136 0.291 0.201 0.317 0.850 0.302 0.322 0.122 0.655 0.225 0.141 0.579
## [2425] 0.267 0.335 0.458 0.829 0.729 0.789 0.473 0.663 0.215 0.429 0.419 0.116
## [2437] 0.239 0.340 0.241 0.827 0.306 0.422 0.270 0.310 0.377 0.242 0.677 0.741
## [2449] 0.144 0.168 0.335 0.484 0.134 0.390 0.557 0.267 0.154 0.136 0.233 0.490
## [2461] 0.565 0.891 0.342 0.467 0.590 0.828 0.413 0.486 0.226 0.683 0.235 0.804
## [2473] 0.178 0.978 0.552 0.483 0.592 0.455 0.798 0.356 0.655 0.941 0.560 0.146
## [2485] 0.508 0.594 0.491 0.520 0.531 0.887 0.465 0.382 0.609 0.640 0.225 0.416
## [2497] 0.396 0.239 0.318 0.533 0.394 0.499 0.377 0.704 0.456 0.515 0.558 0.682
## [2509] 0.428 0.534 0.873 0.665 0.404 0.190 0.612 0.228 0.348 0.717 0.236 0.418
## [2521] 0.445 0.405 0.312 0.696 0.722 0.225 0.512 0.493 0.252 0.272 0.552 0.284
## [2533] 0.646 0.464 0.288 0.571 0.646 0.390 0.538 0.477 0.176 0.607 0.339 0.411
## [2545] 0.488 0.423 0.487 0.429 0.522 0.398 0.285 0.858 0.522 0.437 0.897 0.621
## [2557] 0.399 0.342 0.416 0.487 0.572 0.488 0.459 0.482 0.925 0.762 0.393 0.506
## [2569] 0.559 0.425 0.569 0.471 0.595 0.449 0.357 0.307 0.683 0.485 0.322 0.143
## [2581] 0.478 0.597 0.176 0.413 0.376 0.459 0.353 0.299 0.471 0.457 0.666 0.369
## [2593] 0.791 0.423 0.518 0.587 0.281 0.143 0.388 0.427 0.815 0.321 0.235 0.154
## [2605] 0.462 0.433 0.426 0.639 0.584 0.284 0.261 0.220 0.355 0.256 0.735 0.559
## [2617] 0.177 0.266 0.424 0.331 0.343 0.496 0.392 0.271 0.418 0.285 0.303 0.469
## [2629] 0.814 0.215 0.376 0.738 0.460 0.462 0.495 0.370 0.392 0.465 0.702 0.790
## [2641] 0.589 0.333 0.388 0.440 0.562 0.338 0.509 0.399 0.487 0.662 0.474 0.437
## [2653] 0.388 0.340 0.425 0.702 0.419 0.930 0.717 0.653 0.726 0.339 0.900 0.656
## [2665] 0.305 0.456 0.396 0.342 0.569 0.573 0.359 0.409 0.719 0.393 0.257 0.410
## [2677] 0.481 0.457 0.201 0.374 0.705 0.377 0.454 0.354 0.275 0.364 0.513 0.523
## [2689] 0.269 0.571 0.636 0.575 0.149 0.473 0.786 0.654 0.368 0.172 0.285 0.444
## [2701] 0.344 0.320 0.336 0.434 0.318 0.295 0.420 0.514 0.322 0.173 0.465 0.532
## [2713] 0.259 0.448 0.345 0.458 0.718 0.606 0.411 0.367 0.798 0.582 0.428 0.543
## [2725] 0.216 0.404 0.268 0.354 0.580 0.388 0.145 1.000 0.399 0.212 0.438 0.576
## [2737] 0.672 0.289 0.485 0.492 0.155 0.577 0.612 0.408 0.439 0.357 0.322 0.335
## [2749] 0.386 0.117 0.274 0.862 0.358 0.298 0.696 0.480 0.828 0.743 0.349 0.741
## [2761] 0.307 0.384 0.870 0.798 0.360 0.413 0.355 0.322 0.304 0.277 0.215 0.535
## [2773] 0.348 0.249 0.461 0.750 0.375 0.496 0.441 0.342 0.110 0.319 0.263 0.173
## [2785] 0.525 0.517 0.625 0.385 0.195 0.536 0.485 0.386 0.202 0.267 0.117 0.511
## [2797] 0.278 0.166 0.801 0.795 0.407 0.332 0.471 0.304 0.397 0.305 0.514 0.569
## [2809] 0.268 0.269 0.199 0.391 0.357 0.413 0.134 0.295 0.502 0.234 0.287 0.299
## [2821] 0.380 0.154 0.239 0.468 0.826 0.313 0.617 0.239 0.339 0.522 0.443 0.869
## [2833] 0.284 0.285 0.253 0.292 0.259 0.801 0.355 0.172 0.201 0.245 0.187 0.529
## [2845] 0.375 0.395 0.388 0.231 0.427 0.431 0.380 0.445 0.506 0.173 0.322 0.286
## [2857] 0.120 0.151 0.964 0.683 0.678 0.367 0.096 0.384 0.440 0.154 0.449 0.508
## [2869] 0.355 0.309 0.302 0.405 0.366 0.308 0.387 0.387 0.178 0.217 0.443 0.324
## [2881] 0.188 0.459 0.229 0.763 0.294 0.779 0.653 0.773 0.167 0.255 0.492 0.438
## [2893] 0.154 0.420 0.318 0.309 0.356 0.207 0.216 0.366 0.165 0.261 0.315 0.211
## [2905] 0.234 0.272 0.310 0.522 0.309 0.371 0.313 0.313 0.332 0.249 0.496 0.761
## [2917] 0.255 0.320 0.104 0.230 0.548 0.482 0.458 0.457 0.595 0.378 0.474 0.462
## [2929] 0.480 0.405 0.813 0.436 0.830 0.825 0.502 0.276 0.535 0.698 0.329 0.492
## [2941] 0.461 0.400 0.390 0.367 1.000 0.903 0.677 0.569 0.266 0.247 0.610 0.314
## [2953] 0.262 0.180 0.587 0.438 0.283 0.277 0.344 0.335 0.307 0.508 0.736 0.348
## [2965] 0.359 0.224 0.439 0.421 0.169 0.486 0.517 0.423 0.287 0.223 0.472 0.537
## [2977] 0.448 0.355 0.620 0.419 0.353 0.210 0.298 0.213 0.942 0.346 0.380 0.434
## [2989] 0.213 0.434 0.296 0.447 0.265 0.329 0.226 0.332 0.276 0.426 0.689 0.296
## [3001] 0.418 0.150 0.357 0.306 0.322 0.337 0.268 0.224 0.188 0.254 0.299 0.172
## [3013] 0.219 0.505 0.243 0.160 0.206 0.362 0.729 0.683 0.228 0.940 0.712 0.475
## [3025] 0.705 0.941 0.384 0.875 0.836 0.290 0.398 0.568 0.576 0.860 0.980 0.569
## [3037] 0.648 0.299 0.321 0.597 0.760 0.778 0.314 0.233 0.748 0.221 0.465 0.759
## [3049] 0.486 0.722 0.415 0.626 0.811 0.636 0.630 0.752 0.451 0.719 0.823 0.910
## [3061] 0.905 0.495 0.596 0.579 0.644 0.766 0.598 0.701 0.817 0.575 0.259 0.422
## [3073] 0.286 0.599 0.896 0.631 0.789 0.867 0.307 0.434 0.245 0.227 0.533 0.541
## [3085] 0.519 0.724 0.731 0.609 0.402 0.435 0.689 0.337 0.332 0.174 0.667 0.730
## [3097] 0.755 0.738 0.734 0.720 0.793 0.676 0.732 0.431 0.538 0.457 0.739 0.779
## [3109] 0.892 0.350 0.866 0.410 0.172 0.711 0.810 0.716 0.699 0.671 0.632 0.426
## [3121] 0.764 0.807 0.725 0.698 0.882 0.876 0.776 0.646 0.856 0.725 0.685 0.744
## [3133] 0.766 0.606 0.443 0.850 0.680 0.916 0.664 0.814 0.396 0.843 0.453 0.316
## [3145] 0.731 0.962 0.783 0.782 0.651 0.724 0.628 0.742 0.528 0.369 0.345 0.412
## [3157] 0.345 0.464 0.763 0.480 0.560 0.617 0.777 0.560 0.310 0.825 0.640 0.454
## [3169] 0.704 0.466 0.771 0.826 0.794 0.844 0.566 0.573 0.610 0.492 0.970 0.231
## [3181] 0.700 0.253 0.457 0.259 0.631 0.655 0.610 0.556 0.647 0.521 0.859 0.451
## [3193] 0.852 0.859 0.729 0.447 0.572 0.715 0.705 0.117 0.761 0.303 0.500 0.537
## [3205] 0.106 0.296 0.075 0.712 0.607 0.676 0.779 0.474 0.315 0.269 0.330 0.561
## [3217] 0.661 0.611 0.713 0.479 0.533 0.807 0.331 0.973 0.237 0.619 0.464 0.703
## [3229] 0.535 0.546 0.536 0.624 0.719 0.838 0.501 0.903 0.629 0.754 0.727 0.662
## [3241] 0.816 0.449 0.850 0.747 0.423 0.741 0.476 0.870 0.691 0.802 0.751 0.686
## [3253] 0.649 0.673 0.806 0.442 0.580 0.814 0.738 0.627 0.707 0.707 0.754 0.716
## [3265] 0.542 0.726 0.686 0.435 0.541 0.559 0.725 0.568 0.260 0.575 1.000 0.962
## [3277] 0.507 0.279 0.762 0.164 0.549 0.568 0.821 0.354 0.440 0.449 0.797 0.807
## [3289] 0.282 0.760 0.611 0.449 0.584 0.597 0.812 0.465 0.702 0.698 0.708 0.801
## [3301] 0.824 0.738 0.847 0.874 0.532 0.378 0.585 0.253 0.667 0.457 0.323 0.657
## [3313] 0.614 0.516 0.770 0.821 0.663 0.601 0.688 0.689 0.535 0.802 0.441 0.433
## [3325] 0.445 0.713 0.679 0.724 0.782 0.722 0.495 0.522 0.376 0.723 0.641 0.481
## [3337] 0.652 0.669 0.385 0.630 0.462 0.635 0.506 0.574 0.213 0.420 0.665 0.759
## [3349] 0.622 0.347 0.636 0.152 0.725 0.260 0.572 0.720 0.976 0.967 0.660 0.762
## [3361] 0.754 0.770 0.182 0.461 0.355 0.916 0.773 0.554 0.522 0.551 0.410 0.735
## [3373] 0.688 0.649 0.338 0.291 0.786 0.256 0.088 0.555 0.749 0.741 0.781 0.692
## [3385] 0.443 0.062 0.513 0.741 0.528 0.232 0.719 0.788 0.604 0.800 0.791 0.786
## [3397] 0.701 0.488 0.550 0.471 0.790 0.656 0.804 0.635 0.681 0.566 0.662 0.493
## [3409] 0.760 0.773 0.563 0.637 0.646 0.665 0.304 0.162 0.755 0.345 0.652 0.949
## [3421] 0.770 0.888 0.852 0.685 0.748 0.763 0.478 0.849 0.553 0.758 0.773 0.656
## [3433] 0.881 0.735 0.775 0.684 0.691 0.552 0.405 0.325 0.564 0.607 0.697 0.682
## [3445] 0.833 0.765 0.116 0.719 0.805 0.647 0.354 0.243 0.325 0.783 0.855 0.679
## [3457] 0.394 0.576 0.877 0.743 0.884 0.893 0.617 0.347 0.470 0.330 0.941 0.757
## [3469] 0.396 0.600 0.585 0.627 0.067 0.684 0.871 0.719 0.691 0.626 0.696 0.218
## [3481] 0.666 0.724 0.642 0.359 0.847 0.270 0.719 0.855 0.197 0.522 0.658 0.731
## [3493] 0.715 0.389 0.808 0.262 0.603 0.403 0.753 0.425 0.481 0.617 0.287 0.989
## [3505] 0.711 0.519 0.740 0.711 0.721 0.677 0.819 0.866 0.785 0.739 0.626 0.237
## [3517] 0.506 0.727 0.755 0.793 0.754 0.770 0.678 0.708 0.193 0.813 0.905 0.615
## [3529] 0.398 0.270 0.797 0.484 0.617 0.700 0.424 0.344 0.563 0.698 0.978 0.653
## [3541] 0.847 0.614 0.727 0.667 0.598 0.711 0.749 0.492 0.662 0.803 0.954 0.617
## [3553] 0.609 0.369 0.689 0.729 0.752 0.564 0.328 0.778 0.695 0.253 0.539 0.511
## [3565] 0.630 0.328 0.683 0.333 0.643 0.654 0.498 0.321 0.651 0.500 0.210 0.809
## [3577] 0.909 0.554 0.664 0.794 0.462 0.552 0.575 0.572 0.351 0.698 0.597 0.700
## [3589] 0.484 0.876 0.749 0.736 0.702 0.835 0.787 0.350 0.788 0.703 0.444 0.586
## [3601] 0.675 0.855 0.897 0.510 0.776 0.724 0.477 0.477 0.656 0.315 0.582 0.531
## [3613] 0.652 0.092 0.791 0.847 0.646 0.608 0.892 0.595 0.458 0.797 0.755 0.735
## [3625] 0.774 0.804 0.824 0.759 0.730 0.740 0.764 0.440 0.653 0.603 0.858 0.677
## [3637] 0.547 0.735 0.393 0.638 0.630 0.699 0.460 1.000 1.000 0.737 0.869 0.586
## [3649] 0.544 0.429 0.444 0.172 0.652 0.713 0.674 0.658 0.986 0.632 0.714 0.986
## [3661] 0.707 0.864 0.756 0.703 0.675 0.803 0.738 0.755 0.761 0.765 0.914 0.611
## [3673] 0.736 0.705 0.659 0.765 0.807 0.727 0.848 0.686 0.769 0.770 0.225 0.679
## [3685] 0.960 0.835 0.695 0.407 0.811 0.689 0.593 0.658 0.568 0.647 0.721 0.741
## [3697] 0.742 0.729 0.690 0.710 0.729 0.776 0.582 0.678 0.582 0.905 0.852 0.654
## [3709] 0.715 0.712 0.529 0.839 0.946 0.896 0.746 0.787 0.316 0.270 0.498 0.508
## [3721] 0.905 0.766 0.453 0.894 0.506 0.298 0.557 0.573 0.787 0.293 0.598 0.676
## [3733] 0.316 0.528 0.813 0.821 0.929 0.930 0.877 0.803 0.431 0.500 0.891 0.878
## [3745] 0.759 0.733 0.822 0.790 0.704 0.799 0.589 0.782 0.744 0.643 0.871 0.617
## [3757] 0.712 0.663 0.848 0.661 0.447 0.169 0.402 0.483 0.910 0.846 0.843 0.731
## [3769] 0.689 0.552 0.640 0.736 0.774 0.706 0.673 0.689 0.499 0.102 0.628 0.890
## [3781] 0.830 0.710 0.374 0.843 0.679 0.620 0.243 0.196 0.128 0.495 0.029 0.744
## [3793] 0.263 0.283 0.531 0.408 0.865
# Let's check just to be sure: compare the density of pell_value before and
# after min-max scaling. The scaling is a linear transformation (subtract the
# minimum, divide by the range), so the shape of the curve should be the same
# and only the x-axis scale should change.
# Density estimate of the original (unscaled) column.
pellvalue_density <- density(college$pell_value)
plot(pellvalue_density)
# Density estimate of the normalized vector computed earlier.
pellvalue_density_n <- density(pellvalue_n)
plot(pellvalue_density_n)
# Collect the names of the numeric columns as a character vector.
# vapply() with is.numeric replaces the superseded dplyr::select_if() and
# returns the same names in the same column order.
abc <- names(college)[vapply(college, is.numeric, logical(1))]
abc
## [1] "hbcu" "flagship" "student_count" "aid_value"
## [5] "pell_value"
# Apply the min-max normalizer to every numeric column named in `abc` and
# write the scaled columns back into the data frame.
college[abc] <- lapply(
  X = college[abc],
  FUN = normalize
)
# Confirm the numeric columns now lie between 0 and 1.
str(college)
## tibble [3,797 × 7] (S3: tbl_df/tbl/data.frame)
## $ level : Factor w/ 2 levels "2-year","4-year": 2 2 2 2 2 2 1 2 2 2 ...
## $ control : Factor w/ 3 levels "Private for-profit",..: 3 3 2 3 3 3 3 3 3 2 ...
## $ hbcu : num [1:3797] 1 0 0 0 1 0 0 0 0 0 ...
## $ flagship : num [1:3797] 0 0 0 0 0 1 0 0 0 0 ...
## $ student_count: num [1:3797] 0.02368 0.06748 0.00176 0.03335 0.03135 ...
## $ aid_value : num [1:3797] 0.1659 0.1403 0.0544 0.1539 0.1686 ...
## $ pell_value : num [1:3797] 0.712 0.351 0.684 0.328 0.827 0.211 0.651 0.401 0.169 0.214 ...
# Next, one-hot encode the factor/character variables.
# First check which class `college` currently has.
class(college)
## [1] "tbl_df" "tbl" "data.frame"
?one_hot # what issue will we run into here?
## starting httpd help server ... done
# one_hot() requires a data.table, so coerce the tibble before encoding.
college_1h <- one_hot(
  as.data.table(college),
  cols = "auto",
  sparsifyNAs = FALSE,
  naCols = FALSE,
  dropCols = TRUE,
  dropUnusedLevels = TRUE
)
?one_hot # look at the various arguments
str(college_1h) # what looks different?
## Classes 'data.table' and 'data.frame': 3797 obs. of 10 variables:
## $ level_2-year : int 0 0 0 0 0 0 1 0 0 0 ...
## $ level_4-year : int 1 1 1 1 1 1 0 1 1 1 ...
## $ control_Private for-profit : int 0 0 0 0 0 0 0 0 0 0 ...
## $ control_Private not-for-profit: int 0 0 1 0 0 0 0 0 0 1 ...
## $ control_Public : int 1 1 0 1 1 1 1 1 1 0 ...
## $ hbcu : num 1 0 0 0 1 0 0 0 0 0 ...
## $ flagship : num 0 0 0 0 0 1 0 0 0 0 ...
## $ student_count : num 0.02368 0.06748 0.00176 0.03335 0.03135 ...
## $ aid_value : num 0.1659 0.1403 0.0544 0.1539 0.1686 ...
## $ pell_value : num 0.712 0.351 0.684 0.328 0.827 0.211 0.651 0.401 0.169 0.214 ...
## - attr(*, ".internal.selfref")=<externalptr>
# This is essentially the baseline (target) that we are trying to outperform with our model: the percentage represented by the positive class. The variable is continuous, so we are going to turn it into a Boolean to be used for classification by selecting the top quartile of values.
# Draw a horizontal boxplot of the scaled pell_value and keep the summary
# object that boxplot() returns, then print it.
box <- boxplot(college_1h$pell_value, horizontal = TRUE)
box
## $stats
## [,1]
## [1,] 0.000
## [2,] 0.324
## [3,] 0.447
## [4,] 0.625
## [5,] 1.000
##
## $n
## [1] 3797
##
## $conf
## [,1]
## [1,] 0.439282
## [2,] 0.454718
##
## $out
## numeric(0)
##
## $group
## numeric(0)
##
## $names
## [1] "1"
# Five statistics stored by boxplot(): lower whisker, lower hinge, median,
# upper hinge and upper whisker.
box[["stats"]]
## [,1]
## [1,] 0.000
## [2,] 0.324
## [3,] 0.447
## [4,] 0.625
## [5,] 1.000
# Tukey's five-number summary computed directly; it agrees with the boxplot
# statistics above.
fivenum(college[["pell_value"]])
## [1] 0.000 0.324 0.447 0.625 1.000
?fivenum # thanks Tukey!
#added this as a new predictor rather than replacing the numeric version
(college_1h$pellvalue_f <- cut(college_1h$pell_value,c(-1,0.43,1),labels = c(0,1)))#why the NA? If we want two segments we input three numbers, start, cut and stop values
## [1] 1 0 1 0 1 0 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 1 0 1 0
## [38] 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 1 0 0 0 0 1 0 1 1 1 1 1 1 0 1 0 1 1 1
## [75] 0 0 0 0 1 1 1 0 0 1 1 1 1 0 0 1 1 0 1 1 1 1 0 0 1 0 1 0 0 0 1 1 0 1 0 1 1
## [112] 1 1 0 1 0 1 0 1 1 0 1 0 0 1 1 1 0 0 1 0 1 1 0 1 1 1 0 1 1 1 0 1 0 1 0 0 0
## [149] 1 0 1 0 0 0 1 1 0 0 1 1 0 1 0 0 0 0 0 1 1 1 1 0 1 1 0 1 1 1 1 1 0 0 1 0 1
## [186] 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1 0
## [223] 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 1
## [260] 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0
## [297] 0 0 1 1 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0
## [334] 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 0 1 0
## [371] 1 0 0 0 1 1 1 0 0 0 0 1 0 0 0 1 0 0 1 1 0 1 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0
## [408] 1 0 1 0 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 1 1 1 0 0 0 0 1 1 1 0 1 0
## [445] 1 1 0 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1
## [482] 0 0 1 0 0 0 0 1 1 0 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 0 0 1 1 1 0 0 1 0 1 0
## [519] 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 1 0 1 1 0 1 1 1 0 1 1 1 0 0 1 1 1 0 1
## [556] 1 1 1 0 0 1 0 1 0 1 1 0 0 0 0 1 0 0 1 1 0 1 0 1 0 0 1 1 1 0 1 1 1 1 1 1 0
## [593] 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0
## [630] 1 0 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1
## [667] 1 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 1 1 0 1 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0
## [704] 1 1 0 1 1 1 0 0 1 0 0 1 1 1 0 0 0 1 0 0 0 0 1 0 1 1 1 1 0 0 0 0 0 0 0 1 0
## [741] 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 1 1 0 0 1 1 0 1 0 0 1 0 1 0 0
## [778] 0 1 0 1 0 0 0 1 1 0 1 0 0 1 0 1 1 0 1 0 1 0 0 0 1 1 0 0 1 0 1 1 0 1 0 0 0
## [815] 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 1 1 1 1
## [852] 1 1 1 1 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1
## [889] 1 1 1 1 0 1 0 1 1 1 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0
## [926] 1 1 0 0 1 1 0 0 0 0 0 1 1 1 0 1 0 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 0 0 0 0 1
## [963] 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 1 0 1 0 1 1 1
## [1000] 0 1 0 1 0 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 0 0 1 0 1 0 1 1 1 1 1 1 0 1 1 1 0
## [1037] 0 1 0 0 0 1 1 0 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1
## [1074] 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 1 0 0 1 1 1 0 1 1
## [1111] 0 1 0 0 0 0 1 1 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0
## [1148] 0 0 0 0 1 1 0 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0
## [1185] 0 1 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 1 0 1 0
## [1222] 1 0 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1 0 0 1
## [1259] 1 0 0 1 1 0 1 0 1 0 1 1 1 1 0 1 1 0 1 1 0 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 1
## [1296] 1 0 1 1 1 1 0 0 0 0 0 1 1 0 0 0 1 0 1 1 1 0 1 1 1 0 1 0 1 0 1 0 1 0 0 0 0
## [1333] 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 1 1 0 0 0 1 1 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1
## [1370] 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1
## [1407] 1 1 1 1 1 1 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1
## [1444] 1 1 1 1 0 1 0 1 1 1 0 1 0 0 0 0 1 1 1 1 1 0 1 1 0 1 0 0 0 0 0 1 0 0 1 1 0
## [1481] 1 1 0 1 0 1 1 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 1 1 1 0
## [1518] 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0
## [1555] 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 0
## [1592] 1 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 1
## [1629] 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 1 0 1 1 1 1 0 0 0 1 1 1 0 1 1
## [1666] 0 0 1 1 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 0 1 0 0 1 0 0
## [1703] 0 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0
## [1740] 1 1 1 0 0 0 0 1 0 1 0 0 1 1 0 0 1 1 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 1 0 1
## [1777] 1 1 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 0 1 0 1 0 0 1 1 0 0 0 1
## [1814] 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 1 1 1 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0
## [1851] 0 0 1 0 1 0 0 0 0 0 1 1 0 0 1 1 0 0 1 0 0 1 0 1 0 0 1 1 0 1 1 1 1 1 0 1 1
## [1888] 0 1 1 0 0 1 1 1 1 1 0 0 0 0 1 0 1 1 0 1 1 1 0 1 1 1 1 1 0 0 1 0 1 1 0 1 1
## [1925] 1 0 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 0 0 1 1 1 0 0 0 1 1
## [1962] 0 0 1 0 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 0 0 1 1
## [1999] 1 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 1 1 0 1 1 1 0 0 1 0 0 1 0 1 0
## [2036] 1 1 0 1 0 0 0 1 1 1 0 1 0 0 1 0 1 1 1 0 1 1 0 0 1 1 0 1 0 1 1 0 1 1 1 0 0
## [2073] 1 1 1 0 1 1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 0 0 0 1 1 1 0 1 0 0 0 0 1 0 1
## [2110] 1 1 1 0 1 0 1 1 0 1 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 1 1 0 1 1 1 1 1
## [2147] 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 0 1 0 1 1 1 0 0 0 0 0 1 1 1 0 1 1 1 0 1
## [2184] 1 0 0 1 1 1 1 1 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 1 1 1 1 0 0 1 0 0 1 0 1
## [2221] 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0 1 1 1 0 1 0 0 1 0 1 0 0 0 1 1 0 1 1 1 1 0 1
## [2258] 0 1 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 0
## [2295] 1 0 0 0 1 0 1 1 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 1 0 0 1 0 1 0 0 0 0 1
## [2332] 0 0 1 0 0 1 0 1 1 1 0 0 0 1 1 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 1 0 0 1 0 1
## [2369] 0 0 0 0 0 0 1 1 1 0 1 0 0 1 0 1 1 1 1 0 0 0 0 0 1 0 1 1 1 0 0 1 0 0 1 1 0
## [2406] 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 1 0 0
## [2443] 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 1 1 1 0 1 1 1 0 1 0 1 0 1 0 1 1 1 1 1 1
## [2480] 0 1 1 1 0 1 1 1 1 1 1 1 0 1 1 0 0 0 0 0 1 0 1 0 1 1 1 1 1 0 1 1 1 0 0 1 0
## [2517] 0 1 0 0 1 0 0 1 1 0 1 1 0 0 1 0 1 1 0 1 1 0 1 1 0 1 0 0 1 0 1 0 1 0 0 1 1
## [2554] 1 1 1 0 0 0 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 0 0 1 1 0 0 1 1 0 0 0 1 0 0 1 1
## [2591] 1 0 1 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0
## [2628] 1 1 0 0 1 1 1 1 0 0 1 1 1 1 0 0 1 1 0 1 0 1 1 1 1 0 0 0 1 0 1 1 1 1 0 1 1
## [2665] 0 1 0 0 1 1 0 0 1 0 0 0 1 1 0 0 1 0 1 0 0 0 1 1 0 1 1 1 0 1 1 1 0 0 0 1 0
## [2702] 0 0 1 0 0 0 1 0 0 1 1 0 1 0 1 1 1 0 0 1 1 0 1 0 0 0 0 1 0 0 1 0 0 1 1 1 0
## [2739] 1 1 0 1 1 0 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0 1
## [2776] 1 0 1 1 0 0 0 0 0 1 1 1 0 0 1 1 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 1 1 0 0 0 0
## [2813] 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0
## [2850] 1 0 1 1 0 0 0 0 0 1 1 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 1
## [2887] 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 1 1
## [2924] 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 1 0 0 0 1 1 0 0 0 0
## [2961] 0 1 1 0 0 0 1 0 0 1 1 0 0 0 1 1 1 0 1 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0
## [2998] 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 1 1 1 1 0 1 1 0 0 1 1 1
## [3035] 1 1 1 0 0 1 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
## [3072] 0 0 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 0 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3109] 1 0 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1
## [3146] 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0
## [3183] 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 0 0 0 1 1 1 1 1 0 0 0 1 1 1 1
## [3220] 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1
## [3257] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 1 0 1 1 1 1
## [3294] 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3331] 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 0 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 0 1 0 1 1
## [3368] 1 1 1 0 1 1 1 0 0 1 0 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3405] 1 1 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1
## [3442] 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 0 1 1 1 1 1
## [3479] 1 0 1 1 1 0 1 0 1 1 0 1 1 1 1 0 1 0 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
## [3516] 0 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [3553] 1 0 1 1 1 1 0 1 1 0 1 1 1 0 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1
## [3590] 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
## [3627] 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1
## [3664] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
## [3701] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1
## [3738] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1
## [3775] 1 1 1 0 1 1 1 1 0 1 1 1 0 0 0 1 0 1 0 0 1 0 1
## Levels: 0 1
?cut
View(college_1h)
#So now let's check the prevalence
(prevalence <- table(college_1h$pellvalue_f)[[2]]/length(college_1h$pellvalue_f))#we are using [[]] to pull at the second entry/column in the table
## [1] 0.534896
table(college_1h$pellvalue_f)
##
## 0 1
## 1766 2031
21/(21+55)
## [1] 0.2763158
length(college_1h)
## [1] 11
# Training|Evaluation, Tune|Evaluation, Test|Evaluation
# Divide up our data into three parts, Training, Tuning, and Test
#There is not an easy way to create 3 partitions using createDataPartition,
#so we are going to use it twice, mostly because we want to stratify on the variable we are working to predict. What does that mean?
# Draw the row indices for the training partition, sampling within the
# levels of the outcome factor so each partition keeps a similar class mix.
part_index_1 <- caret::createDataPartition(
  y = college_1h$pellvalue_f, # outcome to stratify on
  p = 0.70,                   # share of rows that go to the training partition
  times = 1,                  # number of partitions to create
  list = FALSE,               # return the indices as a matrix, not a list
  groups = 1                  # quantile breaks; per caret docs only used for numeric y
)
View(part_index_1)
dim(college_1h)
## [1] 3797 11
# Rows picked by part_index_1 become the training set (the 70%).
train <- college_1h[part_index_1, ]
# Everything that was not picked (the other 30%) is held back.
tune_and_test <- college_1h[-part_index_1, ]

# Then we use createDataPartition again, this time to split the held-back
# rows in half and create the tuning set, stratifying on the same outcome.
tune_and_test_index <- createDataPartition(
  y = tune_and_test$pellvalue_f,
  times = 1,
  p = 0.5,
  list = FALSE
)
tune <- tune_and_test[tune_and_test_index, ]
test <- tune_and_test[-tune_and_test_index, ]
dim(train)
## [1] 2659 11
dim(tune)
## [1] 570 11
dim(test)
## [1] 568 11
table(train$pellvalue_f)#check the prevalance
##
## 0 1
## 1237 1422
15/(40+15)
## [1] 0.2727273
table(test$pellvalue_f)
##
## 0 1
## 264 304
3/(8+3)
## [1] 0.2727273
table(tune$pellvalue_f)# same as above
##
## 0 1
## 265 305
My instincts tell me that this data will be a reliable source and has the necessary variables to address my problem/question. However, I am worried that I may have deleted potentially useful columns, but that is definitely a fixable mistake. Additionally, I am worried because a lot of the data had null values prior to cleaning.
What should one study to have the highest chance of being placed in a job?
Higher salary may mean higher potential for alumni donations.
table(jobs$gender)
##
## F M
## 76 139
table(jobs$ssc_b)
##
## Central Others
## 116 99
table(jobs$hsc_b)
##
## Central Others
## 84 131
table(jobs$hsc_s)
##
## Arts Commerce Science
## 11 113 91
table(jobs$degree_t)
##
## Comm&Mgmt Others Sci&Tech
## 145 11 59
table(jobs$workex)
##
## No Yes
## 141 74
table(jobs$specialisation)
##
## Mkt&Fin Mkt&HR
## 120 95
table(jobs$status)
##
## Not Placed Placed
## 67 148
# Let's look at min-max scaling, placing the numbers between 0 and 1.
###Build our own normalizer, which is maybe how I would go if given the option. If you need to do multiple columns use lapply. See this referred to as a min-max scaler function.
normalize <- function(x, na.rm = TRUE) {
  # Min-max scaler: linearly rescales a numeric vector onto the 0-1 interval.
  #
  # x:     a numeric vector (min and max require numeric inputs).
  # na.rm: if TRUE (the default), missing values are ignored when locating
  #        the minimum and maximum. Without this a single NA makes min(x) and
  #        max(x) NA, which turns every element of the result into NA.
  #        Missing entries themselves remain NA in the output.
  #
  # Returns a numeric vector of the same length as x. A vector with no spread
  # (max equal to min) still produces NaN because the range is zero.
  lowest <- min(x, na.rm = na.rm)
  highest <- max(x, na.rm = na.rm)
  # numerator subtracts the minimum from the entire column,
  # denominator is the range of x
  (x - lowest) / (highest - lowest)
}
(degree_n <- normalize(jobs$degree_p))
## [1] 0.195121951 0.670243902 0.341463415 0.048780488 0.568292683 0.420731707
## [7] 0.707317073 0.390243902 0.536585366 0.268292683 0.243902439 0.690243902
## [13] 0.365853659 0.219512195 0.000000000 0.463414634 0.380487805 0.341463415
## [19] 0.341463415 0.487804878 0.390243902 0.853658537 0.542195122 0.359512195
## [25] 0.703902439 0.004878049 0.390243902 0.390243902 0.426829268 0.195121951
## [31] 0.560975610 0.365853659 0.400000000 0.756097561 0.048780488 0.536585366
## [37] 0.170731707 0.380487805 0.390243902 0.341463415 0.731707317 0.365853659
## [43] 0.365853659 0.439024390 0.756097561 0.536585366 0.380487805 0.170731707
## [49] 0.439024390 0.048780488 0.448780488 0.151219512 0.073170732 0.536585366
## [55] 0.463414634 0.365853659 0.278048780 0.585365854 0.439024390 0.539268293
## [61] 0.536585366 0.411951220 0.424390244 0.341463415 0.609756098 0.170731707
## [67] 0.390243902 0.414634146 0.553658537 0.390243902 0.292682927 0.512195122
## [73] 0.682926829 0.529756098 0.492682927 0.670731707 0.534878049 0.365853659
## [79] 0.353658537 0.390243902 0.463414634 0.414634146 0.585365854 0.439024390
## [85] 0.487804878 0.663414634 0.341463415 0.243902439 0.560975610 0.463414634
## [91] 0.780487805 0.019512195 0.390243902 0.097560976 0.341463415 0.365853659
## [97] 0.634146341 0.268292683 0.365853659 0.317073171 0.195121951 0.439024390
## [103] 0.439024390 0.560975610 0.365853659 0.195121951 0.097560976 0.804878049
## [109] 0.463414634 0.365853659 0.536585366 0.268292683 0.268292683 0.414634146
## [115] 0.463414634 0.390243902 0.404878049 0.560975610 0.682926829 0.356097561
## [121] 0.219512195 0.478048780 0.470731707 0.560975610 0.349512195 0.560975610
## [127] 0.621951220 0.463414634 0.676097561 0.390243902 0.243902439 0.292682927
## [133] 0.341463415 0.658536585 0.536585366 0.463414634 0.341463415 0.536585366
## [139] 0.560975610 0.219512195 0.475609756 0.243902439 0.571463415 0.504146341
## [145] 0.268292683 0.518292683 0.390243902 0.365853659 0.146341463 0.121951220
## [151] 0.195121951 0.609756098 0.829268293 0.365853659 0.243902439 0.241463415
## [157] 0.365853659 0.265853659 0.341463415 0.195121951 0.365853659 0.182926829
## [163] 0.664634146 0.341463415 0.325609756 0.585365854 0.243902439 0.414634146
## [169] 0.195121951 0.274634146 0.243902439 0.536585366 0.146341463 0.121951220
## [175] 0.348048780 0.365853659 0.146341463 0.707317073 0.439024390 0.346341463
## [181] 0.312195122 0.346585366 0.170731707 0.463414634 0.238780488 0.682926829
## [187] 0.268292683 0.414634146 0.106829268 0.468292683 0.268292683 0.536585366
## [193] 0.360975610 0.146341463 0.153658537 0.536585366 0.670731707 1.000000000
## [199] 0.365853659 0.170731707 0.365853659 0.195121951 0.390243902 0.167560976
## [205] 0.560975610 0.365853659 0.243902439 0.268292683 0.365853659 0.365853659
## [211] 0.673170732 0.536585366 0.560975610 0.195121951 0.073170732
#Let's check just to be sure
degreep_density <- density(jobs$degree_p)
plot(degreep_density)
degreep_density_n <- density(degree_n)
plot(degreep_density_n)
abc <- names(select_if(jobs, is.numeric))# select function to find the numeric variables and create a character string
abc
## [1] "sl_no" "ssc_p" "hsc_p" "degree_p" "etest_p" "mba_p" "salary"
#Use lapply to normalize the numeric values
jobs[abc] <- lapply(jobs[abc], normalize) #use apply again with the normalizer function we created.
str(jobs)
## spec_tbl_df [215 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ sl_no : num [1:215] 0 0.00467 0.00935 0.01402 0.01869 ...
## $ gender : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 1 2 2 2 ...
## $ ssc_p : num [1:215] 0.538 0.792 0.497 0.311 0.926 ...
## $ ssc_b : Factor w/ 2 levels "Central","Others": 2 1 1 1 1 2 2 1 1 1 ...
## $ hsc_p : num [1:215] 0.89 0.681 0.511 0.247 0.603 ...
## $ hsc_b : Factor w/ 2 levels "Central","Others": 2 2 1 1 1 2 2 1 1 1 ...
## $ hsc_s : Factor w/ 3 levels "Arts","Commerce",..: 2 3 1 3 2 3 2 3 2 2 ...
## $ degree_p : num [1:215] 0.1951 0.6702 0.3415 0.0488 0.5683 ...
## $ degree_t : Factor w/ 3 levels "Comm&Mgmt","Others",..: 3 3 1 3 1 3 1 3 1 1 ...
## $ workex : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ etest_p : num [1:215] 0.104 0.76 0.521 0.333 0.975 ...
## $ specialisation: Factor w/ 2 levels "Mkt&Fin","Mkt&HR": 2 1 1 2 1 1 1 1 1 1 ...
## $ mba_p : num [1:215] 0.284 0.565 0.247 0.308 0.161 ...
## $ status : Factor w/ 2 levels "Not Placed","Placed": 2 2 2 1 2 1 1 2 2 1 ...
## $ salary : num [1:215] NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, "spec")=
## .. cols(
## .. sl_no = col_double(),
## .. gender = col_character(),
## .. ssc_p = col_double(),
## .. ssc_b = col_character(),
## .. hsc_p = col_double(),
## .. hsc_b = col_character(),
## .. hsc_s = col_character(),
## .. degree_p = col_double(),
## .. degree_t = col_character(),
## .. workex = col_character(),
## .. etest_p = col_double(),
## .. specialisation = col_character(),
## .. mba_p = col_double(),
## .. status = col_character(),
## .. salary = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Next let's one-hot encode those factor variables/character
class(jobs)
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
?one_hot#what issue will we run into here?
jobs_1h <- one_hot(as.data.table(jobs),cols = "auto",sparsifyNAs = FALSE,naCols = FALSE,dropCols = TRUE,dropUnusedLevels = TRUE)#one_hot function requires a data.table class so we coerce the format.
?one_hot# looks at the various arguments
str(jobs_1h)#what looks different?
## Classes 'data.table' and 'data.frame': 215 obs. of 25 variables:
## $ sl_no : num 0 0.00467 0.00935 0.01402 0.01869 ...
## $ gender_F : int 0 0 0 0 0 0 1 0 0 0 ...
## $ gender_M : int 1 1 1 1 1 1 0 1 1 1 ...
## $ ssc_p : num 0.538 0.792 0.497 0.311 0.926 ...
## $ ssc_b_Central : int 0 1 1 1 1 0 0 1 1 1 ...
## $ ssc_b_Others : int 1 0 0 0 0 1 1 0 0 0 ...
## $ hsc_p : num 0.89 0.681 0.511 0.247 0.603 ...
## $ hsc_b_Central : int 0 0 1 1 1 0 0 1 1 1 ...
## $ hsc_b_Others : int 1 1 0 0 0 1 1 0 0 0 ...
## $ hsc_s_Arts : int 0 0 1 0 0 0 0 0 0 0 ...
## $ hsc_s_Commerce : int 1 0 0 0 1 0 1 0 1 1 ...
## $ hsc_s_Science : int 0 1 0 1 0 1 0 1 0 0 ...
## $ degree_p : num 0.1951 0.6702 0.3415 0.0488 0.5683 ...
## $ degree_t_Comm&Mgmt : int 0 0 1 0 1 0 1 0 1 1 ...
## $ degree_t_Others : int 0 0 0 0 0 0 0 0 0 0 ...
## $ degree_t_Sci&Tech : int 1 1 0 1 0 1 0 1 0 0 ...
## $ workex_No : int 1 0 1 1 1 0 1 0 1 1 ...
## $ workex_Yes : int 0 1 0 0 0 1 0 1 0 0 ...
## $ etest_p : num 0.104 0.76 0.521 0.333 0.975 ...
## $ specialisation_Mkt&Fin: int 0 1 1 0 1 1 1 1 1 1 ...
## $ specialisation_Mkt&HR : int 1 0 0 1 0 0 0 0 0 0 ...
## $ mba_p : num 0.284 0.565 0.247 0.308 0.161 ...
## $ status_Not Placed : int 0 0 0 1 0 1 1 0 0 1 ...
## $ status_Placed : int 1 1 1 0 1 0 0 1 1 0 ...
## $ salary : num NA NA NA NA NA NA NA NA NA NA ...
## - attr(*, ".internal.selfref")=<externalptr>
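#Prevalence is essentially the baseline that we are trying to outperform with our model: the percentage of observations in the positive class. degree_p is continuous, so we are going to turn it into a Boolean to be used for classification by choosing a cut point (for example the top quartile of values; note that the cut used below, 0.43, lies between the median of 0.390 and the upper quartile of 0.537).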
(box <- boxplot(jobs_1h$degree_p, horizontal = TRUE))
## $stats
## [,1]
## [1,] 0.0000000
## [2,] 0.2682927
## [3,] 0.3902439
## [4,] 0.5365854
## [5,] 0.8536585
##
## $n
## [1] 215
##
## $conf
## [,1]
## [1,] 0.3613340
## [2,] 0.4191538
##
## $out
## [1] 1
##
## $group
## [1] 1
##
## $names
## [1] "1"
box$stats
## [,1]
## [1,] 0.0000000
## [2,] 0.2682927
## [3,] 0.3902439
## [4,] 0.5365854
## [5,] 0.8536585
fivenum(jobs$degree_p)
## [1] 0.0000000 0.2682927 0.3902439 0.5365854 1.0000000
?fivenum#thanks Tukey!
#added this as a new predictor rather than replacing the numeric version
(jobs_1h$degreep_f <- cut(jobs_1h$degree_p,c(-1,0.43,1),labels = c(0,1)))#why the NA? If we want two segments we input three numbers, start, cut and stop values
## [1] 0 1 0 0 1 0 1 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 1 0 1 0 0 0 0 0 1 0 0 1 0 1 0
## [38] 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 1 0 0 1 1 1 1 0 0 0 1 0 0 0 1 0 0 1 1 1
## [75] 1 1 1 0 0 0 1 0 1 1 1 1 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 1 1 0 1
## [112] 0 0 0 1 0 0 1 1 0 0 1 1 1 0 1 1 1 1 0 0 0 0 1 1 1 0 1 1 0 1 0 1 1 0 1 0 0
## [149] 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 1 0
## [186] 1 0 0 0 1 0 1 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0
## Levels: 0 1
?cut
View(jobs_1h)
#So now let's check the prevalence
(prevalence <- table(jobs_1h$degreep_f)[[2]]/length(jobs_1h$degreep_f))#we are using [[]] to pull at the second entry/column in the table
## [1] 0.3953488
table(jobs_1h$degreep_f)
##
## 0 1
## 130 85
21/(21+55)
## [1] 0.2763158
length(jobs_1h)
## [1] 26
# Training|Evaluation, Tune|Evaluation, Test|Evaluation
# Divide up our data into three parts, Training, Tuning, and Test
#There is not an easy way to create 3 partitions using createDataPartition,
#so we are going to use it twice. Mostly because we want to stratify on the variable we are working to predict. What does that mean?
jobs_dt <- jobs_1h[,-c("sl_no","salary")]#using indexing to drop these two columns, creating a new dataframe so we don't delete these columns from our working environment.
view(jobs_dt)
# Draw the row indices for the training partition, sampling within the
# levels of the outcome factor so each partition keeps a similar class mix.
part_index_1 <- caret::createDataPartition(
  y = jobs_dt$degreep_f, # outcome to stratify on
  p = 0.70,              # share of rows that go to the training partition
  times = 1,             # number of partitions to create
  list = FALSE,          # return the indices as a matrix, not a list
  groups = 1             # quantile breaks; per caret docs only used for numeric y
)
View(part_index_1)
dim(jobs_dt)
## [1] 215 24
# Rows picked by part_index_1 become the training set (the 70%).
train <- jobs_dt[part_index_1, ]
# Everything that was not picked (the other 30%) is held back.
tune_and_test <- jobs_dt[-part_index_1, ]

# Then we use createDataPartition again, this time to split the held-back
# rows in half and create the tuning set, stratifying on the same outcome.
tune_and_test_index <- createDataPartition(
  y = tune_and_test$degreep_f,
  times = 1,
  p = 0.5,
  list = FALSE
)
tune <- tune_and_test[tune_and_test_index, ]
test <- tune_and_test[-tune_and_test_index, ]
dim(train)
## [1] 151 24
dim(tune)
## [1] 33 24
dim(test)
## [1] 31 24
table(train$degreep_f) #check the prevalance
##
## 0 1
## 91 60
15/(40+15)
## [1] 0.2727273
table(test$degreep_f)
##
## 0 1
## 19 12
3/(8+3)
## [1] 0.2727273
table(tune$degreep_f) # same as above
##
## 0 1
## 20 13
I believe that this data has the necessary columns and attributes to be able to solve my problem/question. I am worried that, with a lot of the salary info being null values, any questions regarding maximizing salary potential may be inhibited.