#Q1

# Install tidyverse only when it is missing, so re-running the script does not
# download the package from CRAN again on every run.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse", repos = "https://cloud.r-project.org")
}
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpwJ7MSh/downloaded_packages
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Read train.csv (resolved against the working directory) into the tibble
# that every later step operates on.
mydata <- readr::read_csv(file = "train.csv")
## Rows: 891 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (7): PassengerId, Survived, Pclass, Age, SibSp, Parch, Fare
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the structure of the data: each column's type and first few values.
utils::str(object = mydata)
## spc_tbl_ [891 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ PassengerId: num [1:891] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : num [1:891] 0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : num [1:891] 3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr [1:891] "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr [1:891] "male" "female" "female" "female" ...
##  $ Age        : num [1:891] 22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : num [1:891] 1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : num [1:891] 0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr [1:891] "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num [1:891] 7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr [1:891] NA "C85" NA "C123" ...
##  $ Embarked   : chr [1:891] "S" "C" "S" "S" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   PassengerId = col_double(),
##   ..   Survived = col_double(),
##   ..   Pclass = col_double(),
##   ..   Name = col_character(),
##   ..   Sex = col_character(),
##   ..   Age = col_double(),
##   ..   SibSp = col_double(),
##   ..   Parch = col_double(),
##   ..   Ticket = col_character(),
##   ..   Fare = col_double(),
##   ..   Cabin = col_character(),
##   ..   Embarked = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

1- The variables are: PassengerId (quantitative), Survived (quantitative), Pclass (quantitative), Name (qualitative), Sex (qualitative), Age (quantitative), SibSp (quantitative), Parch (quantitative), Ticket (qualitative), Fare (quantitative), Cabin (qualitative), Embarked (qualitative).

-PassengerId's level of measurement is ratio and discrete. -Age's level of measurement is ratio and continuous.

Q1b-

# Missing-value counts for the first three columns, one column at a time.
sum(is.na(mydata[["PassengerId"]]))
## [1] 0
sum(is.na(mydata[["Survived"]]))
## [1] 0
sum(is.na(mydata[["Pclass"]]))
## [1] 0
# Easier way: count the missing values of every column in a single call.

help("lapply")
lapply(mydata, function(column) sum(is.na(column)))
## $PassengerId
## [1] 0
## 
## $Survived
## [1] 0
## 
## $Pclass
## [1] 0
## 
## $Name
## [1] 0
## 
## $Sex
## [1] 0
## 
## $Age
## [1] 177
## 
## $SibSp
## [1] 0
## 
## $Parch
## [1] 0
## 
## $Ticket
## [1] 0
## 
## $Fare
## [1] 0
## 
## $Cabin
## [1] 687
## 
## $Embarked
## [1] 2

Q1b- Cabin is the variable with the most missing observations (missing 687)

Q2-

Imputing (filling in) the missing observations using the median function

help("median")
# Replace the missing entries of each column with that column's median,
# computed from the observed (non-missing) values only.
mydata$Age <- replace(
  mydata$Age, is.na(mydata$Age), median(mydata$Age, na.rm = TRUE)
)
mydata$SibSp <- replace(
  mydata$SibSp, is.na(mydata$SibSp), median(mydata$SibSp, na.rm = TRUE)
)
mydata$Parch <- replace(
  mydata$Parch, is.na(mydata$Parch), median(mydata$Parch, na.rm = TRUE)
)
# Print the tibble to inspect the imputed data.
print(mydata)
## # A tibble: 891 × 12
##    PassengerId Survived Pclass Name   Sex     Age SibSp Parch Ticket  Fare Cabin
##          <dbl>    <dbl>  <dbl> <chr>  <chr> <dbl> <dbl> <dbl> <chr>  <dbl> <chr>
##  1           1        0      3 Braun… male     22     1     0 A/5 2…  7.25 <NA> 
##  2           2        1      1 Cumin… fema…    38     1     0 PC 17… 71.3  C85  
##  3           3        1      3 Heikk… fema…    26     0     0 STON/…  7.92 <NA> 
##  4           4        1      1 Futre… fema…    35     1     0 113803 53.1  C123 
##  5           5        0      3 Allen… male     35     0     0 373450  8.05 <NA> 
##  6           6        0      3 Moran… male     28     0     0 330877  8.46 <NA> 
##  7           7        0      1 McCar… male     54     0     0 17463  51.9  E46  
##  8           8        0      3 Palss… male      2     3     1 349909 21.1  <NA> 
##  9           9        1      3 Johns… fema…    27     0     2 347742 11.1  <NA> 
## 10          10        1      2 Nasse… fema…    14     1     0 237736 30.1  <NA> 
## # … with 881 more rows, and 1 more variable: Embarked <chr>

Q3-

# Install psych only when it is missing. The package name was previously
# misspelled as "pscyh", so the install attempt failed with a
# "package is not available" warning instead of installing psych.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych", repos = "https://cloud.r-project.org")
}
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Descriptive statistics (n, mean, sd, median, ...) for the imputed columns.
psych::describe(mydata[["Age"]])
##    vars   n  mean    sd median trimmed mad  min max range skew kurtosis   se
## X1    1 891 29.36 13.02     28   28.83 8.9 0.42  80 79.58 0.51     0.97 0.44
psych::describe(mydata[["SibSp"]])
##    vars   n mean  sd median trimmed mad min max range skew kurtosis   se
## X1    1 891 0.52 1.1      0    0.27   0   0   8     8 3.68    17.73 0.04
psych::describe(mydata[["Parch"]])
##    vars   n mean   sd median trimmed mad min max range skew kurtosis   se
## X1    1 891 0.38 0.81      0    0.18   0   0   6     6 2.74     9.69 0.03

Q4-

# Look up the documentation for table().
help("table")
## Help on topic 'table' was found in the following packages:
## 
##   Package               Library
##   vctrs                 /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
##   base                  /Library/Frameworks/R.framework/Resources/library
## 
## 
## Using the first match ...
# Two-way frequency table: rows are Survived (0/1), columns are Sex.
table(mydata[["Survived"]], mydata[["Sex"]])
##    
##     female male
##   0     81  468
##   1    233  109
# Install gmodels only when it is missing, so re-running the script does not
# download the package from CRAN again on every run.
if (!requireNamespace("gmodels", quietly = TRUE)) {
  install.packages("gmodels", repos = "https://cloud.r-project.org")
}
## 
## The downloaded binary packages are in
##  /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpwJ7MSh/downloaded_packages
library(gmodels)
help("CrossTable")
# Cross-tabulate survival (rows) against sex (columns). The arguments are
# passed as mydata$... expressions because CrossTable() prints the argument
# expressions as the table headers.
CrossTable(x = mydata$Survived, y = mydata$Sex)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  891 
## 
##  
##                 | mydata$Sex 
## mydata$Survived |    female |      male | Row Total | 
## ----------------|-----------|-----------|-----------|
##               0 |        81 |       468 |       549 | 
##                 |    65.386 |    35.583 |           | 
##                 |     0.148 |     0.852 |     0.616 | 
##                 |     0.258 |     0.811 |           | 
##                 |     0.091 |     0.525 |           | 
## ----------------|-----------|-----------|-----------|
##               1 |       233 |       109 |       342 | 
##                 |   104.962 |    57.120 |           | 
##                 |     0.681 |     0.319 |     0.384 | 
##                 |     0.742 |     0.189 |           | 
##                 |     0.262 |     0.122 |           | 
## ----------------|-----------|-----------|-----------|
##    Column Total |       314 |       577 |       891 | 
##                 |     0.352 |     0.648 |           | 
## ----------------|-----------|-----------|-----------|
## 
## 

-Out of the 577 males, only 109 survived (~19%). -Out of the 314 females, 233 survived (~74%).

Q5-

?boxplot
# Notched, horizontal box plots of passenger age, one box per value of
# Survived (pink for 0, blue for 1). Fixes over the earlier version: the
# axis labels are spelled correctly and TRUE is written out instead of the
# reassignable shorthand T.
boxplot(
  mydata$Age ~ mydata$Survived,
  col = c("pink", "blue"),
  main = "Distribution of Variables",
  xlab = "Age of passengers",
  ylab = "Survived Passengers",
  notch = TRUE,
  horizontal = TRUE
)