#Q1
# Install tidyverse only when it is not already available, so that re-running
# the script does not download and reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse", repos = "https://cloud.r-project.org")
}
##
## The downloaded binary packages are in
## /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpwJ7MSh/downloaded_packages
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Read train.csv (path relative to the working directory) into a tibble.
mydata <- readr::read_csv(file = "train.csv")
## Rows: 891 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (7): PassengerId, Survived, Pclass, Age, SibSp, Parch, Fare
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the structure of the data: each column with its type and first values.
utils::str(object = mydata)
## spc_tbl_ [891 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ PassengerId: num [1:891] 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : num [1:891] 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : num [1:891] 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr [1:891] "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr [1:891] "male" "female" "female" "female" ...
## $ Age : num [1:891] 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : num [1:891] 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : num [1:891] 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr [1:891] "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num [1:891] 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr [1:891] NA "C85" NA "C123" ...
## $ Embarked : chr [1:891] "S" "C" "S" "S" ...
## - attr(*, "spec")=
## .. cols(
## .. PassengerId = col_double(),
## .. Survived = col_double(),
## .. Pclass = col_double(),
## .. Name = col_character(),
## .. Sex = col_character(),
## .. Age = col_double(),
## .. SibSp = col_double(),
## .. Parch = col_double(),
## .. Ticket = col_character(),
## .. Fare = col_double(),
## .. Cabin = col_character(),
## .. Embarked = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
- PassengerId: the level of measurement is ratio and discrete. - Age: the level of measurement is ratio and continuous.
Q1b-
# Count the missing values in the first three columns, one column at a time.
sum(is.na(mydata[["PassengerId"]]))
## [1] 0
sum(is.na(mydata[["Survived"]]))
## [1] 0
sum(is.na(mydata[["Pclass"]]))
## [1] 0
# Easier way: apply the same missing-value count to every column at once.
# lapply() returns a named list holding one count per column.
lapply(mydata, function(x) sum(is.na(x)))
## $PassengerId
## [1] 0
##
## $Survived
## [1] 0
##
## $Pclass
## [1] 0
##
## $Name
## [1] 0
##
## $Sex
## [1] 0
##
## $Age
## [1] 177
##
## $SibSp
## [1] 0
##
## $Parch
## [1] 0
##
## $Ticket
## [1] 0
##
## $Fare
## [1] 0
##
## $Cabin
## [1] 687
##
## $Embarked
## [1] 2
Q1b- Cabin is the variable with the most missing observations (missing 687)
Q2-
Imputing the missing observations using the median function.
# Replace the missing entries of a vector with the median of its observed
# (non-missing) values; entries that are not missing are left unchanged.
impute_median <- function(x) {
  x[is.na(x)] <- median(x, na.rm = TRUE)
  x
}

# Median-impute the three numeric variables used in the later questions.
mydata$Age <- impute_median(mydata$Age)
mydata$SibSp <- impute_median(mydata$SibSp)
mydata$Parch <- impute_median(mydata$Parch)

# Print the tibble to inspect the imputed values.
mydata
## # A tibble: 891 × 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
## 1 1 0 3 Braun… male 22 1 0 A/5 2… 7.25 <NA>
## 2 2 1 1 Cumin… fema… 38 1 0 PC 17… 71.3 C85
## 3 3 1 3 Heikk… fema… 26 0 0 STON/… 7.92 <NA>
## 4 4 1 1 Futre… fema… 35 1 0 113803 53.1 C123
## 5 5 0 3 Allen… male 35 0 0 373450 8.05 <NA>
## 6 6 0 3 Moran… male 28 0 0 330877 8.46 <NA>
## 7 7 0 1 McCar… male 54 0 0 17463 51.9 E46
## 8 8 0 3 Palss… male 2 3 1 349909 21.1 <NA>
## 9 9 1 3 Johns… fema… 27 0 2 347742 11.1 <NA>
## 10 10 1 2 Nasse… fema… 14 1 0 237736 30.1 <NA>
## # … with 881 more rows, and 1 more variable: Embarked <chr>
Q3-
# Install psych, which provides describe(), only when it is not already
# available.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych", repos = "https://cloud.r-project.org")
}
## Warning: package 'pscyh' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Descriptive statistics (n, mean, sd, median, skew, kurtosis, ...) for Age,
# SibSp and Parch, one variable per call.
psych::describe(mydata[["Age"]])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 891 29.36 13.02 28 28.83 8.9 0.42 80 79.58 0.51 0.97 0.44
psych::describe(mydata[["SibSp"]])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 891 0.52 1.1 0 0.27 0 0 8 8 3.68 17.73 0.04
psych::describe(mydata[["Parch"]])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 891 0.38 0.81 0 0.18 0 0 6 6 2.74 9.69 0.03
Q4-
# Two-way frequency table: the Survived codes (0 and 1) form the rows and the
# Sex categories form the columns.
table(mydata$Survived, mydata$Sex)
##
## female male
## 0 81 468
## 1 233 109
# Install gmodels, which provides CrossTable(), only when it is not already
# available.
if (!requireNamespace("gmodels", quietly = TRUE)) {
  install.packages("gmodels", repos = "https://cloud.r-project.org")
}
##
## The downloaded binary packages are in
## /var/folders/5_/389qrkvs1sd7nkp792bslx5r0000gn/T//RtmpwJ7MSh/downloaded_packages
library(gmodels)
# Cross-tabulate Survived (rows) against Sex (columns). Besides the counts,
# CrossTable() prints chi-square contributions and the row, column and table
# proportions for every cell.
CrossTable(mydata$Survived, mydata$Sex)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 891
##
##
## | mydata$Sex
## mydata$Survived | female | male | Row Total |
## ----------------|-----------|-----------|-----------|
## 0 | 81 | 468 | 549 |
## | 65.386 | 35.583 | |
## | 0.148 | 0.852 | 0.616 |
## | 0.258 | 0.811 | |
## | 0.091 | 0.525 | |
## ----------------|-----------|-----------|-----------|
## 1 | 233 | 109 | 342 |
## | 104.962 | 57.120 | |
## | 0.681 | 0.319 | 0.384 |
## | 0.742 | 0.189 | |
## | 0.262 | 0.122 | |
## ----------------|-----------|-----------|-----------|
## Column Total | 314 | 577 | 891 |
## | 0.352 | 0.648 | |
## ----------------|-----------|-----------|-----------|
##
##
- Out of the 577 males, only 109 survived (~19%). - Out of the 314 females, 233 survived (~74%).
Q5-
# Notched, horizontal box plots of Age, one box per value of Survived.
# With horizontal = TRUE the x-axis carries Age and the y-axis carries the
# Survived groups, so xlab labels the ages and ylab labels the groups.
boxplot(mydata$Age ~ mydata$Survived,
  col = c("pink", "blue"),
  main = "Distribution of Variables",
  xlab = "Age of passengers", ylab = "Survived Passengers",
  notch = TRUE, horizontal = TRUE
)