# Attach the packages used in this report, one call per line: several
# library() calls on a single line without separators do not parse in R.
library(utils)
library(tinytex)
library(stats)
library(psych)
library(methods)
library(knitr)
library(datasets)
library(base)

library(readxl)

# Read the Titanic training sheet from a local Excel workbook.
train <- read_excel("C:/Users/earth/Downloads/train.xlsx")

# Open the data viewer. R is case-sensitive: the viewer function is View().
View(train)

Titanic: Machine Learning through Disaster Competition

Melanie Bosch

# Prints `train` converted to a data.frame. The result is not assigned, so
# `train` itself is left unchanged by this call.
as.data.frame(train)

Question 1

What are the types of variable (quantitative / qualitative) and levels of measurement (nominal / ordinal / interval / ratio) for PassengerId and Age?

library(psych)
# Open the full data set in the viewer.
View(train)
# Print the data set to the console. Evaluating the object name is enough;
# wrapping a data object in eval() only returns it unchanged.
train
## # A tibble: 891 × 12
##    PassengerId Survived Pclass Name   Sex     Age SibSp Parch Ticket  Fare Cabin
##          <dbl>    <dbl>  <dbl> <chr>  <chr> <dbl> <dbl> <dbl> <chr>  <dbl> <chr>
##  1           1        0      3 Braun… male     22     1     0 A/5 2…  7.25 <NA> 
##  2           2        1      1 Cumin… fema…    38     1     0 PC 17… 71.3  C85  
##  3           3        1      3 Heikk… fema…    26     0     0 STON/…  7.92 <NA> 
##  4           4        1      1 Futre… fema…    35     1     0 113803 53.1  C123 
##  5           5        0      3 Allen… male     35     0     0 373450  8.05 <NA> 
##  6           6        0      3 Moran… male     NA     0     0 330877  8.46 <NA> 
##  7           7        0      1 McCar… male     54     0     0 17463  51.9  E46  
##  8           8        0      3 Palss… male      2     3     1 349909 21.1  <NA> 
##  9           9        1      3 Johns… fema…    27     0     2 347742 11.1  <NA> 
## 10          10        1      2 Nasse… fema…    14     1     0 237736 30.1  <NA> 
## # … with 881 more rows, and 1 more variable: Embarked <chr>
# Descriptive statistics for every column of the training data, computed
# with the psych package.
psych::describe(train)
##             vars   n   mean     sd median trimmed    mad  min    max  range
## PassengerId    1 891 446.00 257.35 446.00  446.00 330.62 1.00 891.00 890.00
## Survived       2 891   0.38   0.49   0.00    0.35   0.00 0.00   1.00   1.00
## Pclass         3 891   2.31   0.84   3.00    2.39   0.00 1.00   3.00   2.00
## Name*          4 891 446.00 257.35 446.00  446.00 330.62 1.00 891.00 890.00
## Sex*           5 891   1.65   0.48   2.00    1.68   0.00 1.00   2.00   1.00
## Age            6 714  29.70  14.53  28.00   29.27  13.34 0.42  80.00  79.58
## SibSp          7 891   0.52   1.10   0.00    0.27   0.00 0.00   8.00   8.00
## Parch          8 891   0.38   0.81   0.00    0.18   0.00 0.00   6.00   6.00
## Ticket*        9 891 339.52 200.83 338.00  339.65 268.35 1.00 681.00 680.00
## Fare          10 891  32.20  49.69  14.45   21.38  10.24 0.00 512.33 512.33
## Cabin*        11 204  77.00  42.23  76.00   77.09  54.11 1.00 147.00 146.00
## Embarked*     12 889   2.54   0.79   3.00    2.67   0.00 1.00   3.00   2.00
##              skew kurtosis   se
## PassengerId  0.00    -1.20 8.62
## Survived     0.48    -1.77 0.02
## Pclass      -0.63    -1.28 0.03
## Name*        0.00    -1.20 8.62
## Sex*        -0.62    -1.62 0.02
## Age          0.39     0.16 0.54
## SibSp        3.68    17.73 0.04
## Parch        2.74     9.69 0.03
## Ticket*      0.00    -1.28 6.73
## Fare         4.77    33.12 1.66
## Cabin*       0.00    -1.19 2.96
## Embarked*   -1.26    -0.23 0.03

There are 12 different variables. The quantitative variables are Age, SibSp, Parch, and Fare. The qualitative variables are PassengerId, Survived, Pclass, Name, Sex, Ticket, Cabin, and Embarked. The level of measurement is nominal for PassengerId and interval for Age.

Which variable has the most missing observations?

# Count the missing values in every column in one pass instead of listing
# the NA row indices column by column: is.na() on the whole table yields a
# logical matrix and colSums() adds up the TRUE entries per column.
missing_counts <- colSums(is.na(train))
missing_counts
## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0         177 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0         687           2 
# Name of the column with the largest number of missing values.
names(which.max(missing_counts))
## [1] "Cabin"

The variable with the most missing observations is Cabin.

Question 2

Impute missing observations for Age, SibSp, and Parch with the column median (ordinal, interval, or ratio), or the column mode.

# Keep a copy of Age BEFORE imputing, so "original" really is the raw column
# (including its NAs) and can be compared with, or restored over, the
# imputed version later.
original_age_data <- train$Age
# Replace every missing Age with the median of the observed ages.
train$Age[is.na(train$Age)] <- median(train$Age, na.rm = TRUE)
altered_age <- train$Age
# Raw column: still contains the missing values (re-knit to refresh output).
summary(original_age_data)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.42   20.12   28.00   29.70   38.00   80.00     177 
# Imputed column: no missing values remain.
summary(train$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.42   22.00   28.00   29.36   35.00   80.00
# Keep a copy of SibSp before imputing so the two versions can be compared.
original_sibsp_data <- train$SibSp
# Replace any missing SibSp value with the column median.
train$SibSp[is.na(train$SibSp)] <- median(train$SibSp, na.rm = TRUE)
# Column name spelled correctly (`SibSp`); the former `SipSp` typo returned
# NULL and raised an "Unknown or uninitialised column" warning.
altered_sibsp <- train$SibSp
summary(original_sibsp_data)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.523   1.000   8.000
summary(train$SibSp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.523   1.000   8.000
# Keep a copy of Parch before imputing so the two versions can be compared.
original_parch_data <- train$Parch
# Replace any missing Parch value with the column median.
train$Parch[is.na(train$Parch)] <- median(train$Parch, na.rm = TRUE)
altered_parch <- train$Parch
summary(original_parch_data)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3816  0.0000  6.0000
summary(train$Parch)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3816  0.0000  6.0000

Question 3

Install the psych package in R and invoke it. Then provide descriptive statistics for Age, SibSp, and Parch.

library(psych)

# Age: open the column in the viewer, then print its descriptive statistics.
View(train$Age)
psych::describe(train[["Age"]])
##    vars   n  mean    sd median trimmed mad  min max range skew kurtosis   se
## X1    1 891 29.36 13.02     28   28.83 8.9 0.42  80 79.58 0.51     0.97 0.44
# SibSp: same two steps.
View(train$SibSp)
psych::describe(train[["SibSp"]])
##    vars   n mean  sd median trimmed mad min max range skew kurtosis   se
## X1    1 891 0.52 1.1      0    0.27   0   0   8     8 3.68    17.73 0.04
# Parch: same two steps.
View(train$Parch)
psych::describe(train[["Parch"]])
##    vars   n mean   sd median trimmed mad min max range skew kurtosis   se
## X1    1 891 0.38 0.81      0    0.18   0   0   6     6 2.74     9.69 0.03

Question 4

Provide a cross-tabulation of Survived and Sex. What do you notice?

# Cross-tabulate survival status (rows) against sex (columns).
table(train[["Survived"]], train[["Sex"]])
##    
##     female male
##   0     81  468
##   1    233  109

From the data, 0 stands for did not survive, while 1 stands for survived. This shows that more women survived than men (about twice as many: 233 versus 109). This could be due to the idea of protecting/saving the women and children first by putting them on the first available lifeboats, prior to the men boarding.

Question 5

Provide notched boxplots for Survived and Age. What do you notice?

# Notched horizontal boxplot of Age split by survival status, using the
# current contents of train$Age. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
boxplot(train$Age ~ train$Survived, notch = TRUE, horizontal = TRUE,
        ylab = "Survived", xlab = "Age",
        main = "New Age and Survival Rate, where 1= Survived")

# Five-number summaries of Age for each survival group.
print("Survived")
## [1] "Survived"
summary(train$Age[train$Survived == 1])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.42   21.00   28.00   28.29   35.00   80.00
print("Passed Away")
## [1] "Passed Away"
summary(train$Age[train$Survived == 0])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   23.00   28.00   30.03   35.00   74.00
# Reset Age to the vector saved as original_age_data in Question 2 before
# drawing the second plot.
train$Age <- original_age_data
boxplot(train$Age ~ train$Survived, notch = TRUE, horizontal = TRUE,
        ylab = "Survived", xlab = "Age",
        main = "Original Age and Survival Rate, where 1 is Survived")

From the box plots it is seen that a large number of children and young adults survived compared to those who passed away. There are some outliers in the data that indicate some child and young-adult mortality. The opposite holds for the elderly, where more passed away than survived; this could mean that more of the elderly passengers were male.