# Titanic

# Load the Titanic training data from the current working directory.
mydata <- read.csv(file = "./train.csv")

# Show each column's type together with its first few values.
str(object = mydata)

## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...

# Preview the first six rows of the data frame.
head(mydata, n = 6L)

##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp Parch
## 1                             Braund, Mr. Owen Harris   male  22     1     0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3                              Heikkinen, Miss. Laina female  26     0     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5                            Allen, Mr. William Henry   male  35     0     0
## 6                                    Moran, Mr. James   male  NA     0     0
##             Ticket    Fare Cabin Embarked
## 1        A/5 21171  7.2500              S
## 2         PC 17599 71.2833   C85        C
## 3 STON/O2. 3101282  7.9250              S
## 4           113803 53.1000  C123        S
## 5           373450  8.0500              S
## 6           330877  8.4583              Q

# Per-column summary: quantiles and mean for numeric columns, length and
# class for character columns.
summary(object = mydata)

##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
##

#Q1 a.  What are the types of variable (quantitative / qualitative) and levels of measurement (nominal / ordinal / interval / ratio) for PassengerId and Age?
# Inspect the storage type and leading values of PassengerId.
str(mydata[["PassengerId"]])

##  int [1:891] 1 2 3 4 5 6 7 8 9 10 ...

# Inspect the storage type and leading values of Age.
str(mydata[["Age"]])

##  num [1:891] 22 38 26 35 35 NA 54 2 27 14 ...

# PassengerId is a qualitative variable and Age is a quantitative variable.
# The level of measurement for PassengerId is nominal; for Age it is ratio, because age has a true zero point.

#Q1b.  Which variable has the most missing observations?  You could have Googled for "Count NA values in R for all columns in a dataframe" or something like that.
# Flag every cell holding an empty or single-space string, mark those cells
# as missing, then count the missing values in each column.
blank_cells <- mydata == "" | mydata == " "
mydata[blank_cells] <- NA
colSums(is.na(mydata))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0         177 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0         687           2

# The original data contains some blank cells, which are converted to NA above. The Cabin column has the most missing observations (687).

#Q2.  Impute missing observations for Age, SibSp, and Parch with the column median (ordinal, interval, or ratio), or the column mode.  To do so, use something like this:  mydata$Age[is.na(mydata$Age)] <- median(mydata$Age, na.rm=TRUE) . You can read up on indexing in a data frame (i.e. "dataframe_name$variable_name").

# Summary of Age before imputation (includes the NA count).
summary(mydata[["Age"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.42   20.12   28.00   29.70   38.00   80.00     177

# Summary of SibSp before imputation.
summary(mydata[["SibSp"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.523   1.000   8.000

# Replace the missing values in Age, SibSp and Parch with the median of the
# non-missing values of the same column.
mydata[c("Age", "SibSp", "Parch")] <- lapply(
  mydata[c("Age", "SibSp", "Parch")],
  function(values) {
    values[is.na(values)] <- median(values, na.rm = TRUE)
    values
  }
)

# Summary of Age after imputation.
summary(mydata[["Age"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.42   22.00   28.00   29.36   35.00   80.00

# Summary of SibSp after imputation.
summary(mydata[["SibSp"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.523   1.000   8.000

#Q3.  Install the psych package in R:  install.packages('psych').  Invoke the package using library(psych).  Then provide descriptive statistics for Age, SibSp, and Parch (e.g., describe(mydata$Age)).
# Attach psych, which provides describe().
library(psych)

# Descriptive statistics for Age.
describe(mydata[["Age"]])

##    vars   n  mean    sd median trimmed mad  min max range skew kurtosis   se
## X1    1 891 29.36 13.02     28   28.83 8.9 0.42  80 79.58 0.51     0.97 0.44

# Descriptive statistics for SibSp.
describe(mydata[["SibSp"]])

##    vars   n mean  sd median trimmed mad min max range skew kurtosis   se
## X1    1 891 0.52 1.1      0    0.27   0   0   8     8 3.68    17.73 0.04

# Descriptive statistics for Parch.
describe(mydata[["Parch"]])

##    vars   n mean   sd median trimmed mad min max range skew kurtosis   se
## X1    1 891 0.38 0.81      0    0.18   0   0   6     6 2.74     9.69 0.03

#Q4.  Provide a cross-tabulation of Survived and Sex (e.g., table(mydata$Survived, mydata$Sex)).  What do you notice?
# Count passengers for every combination of Survived (rows) and Sex (columns).
cross_tabulation <- table(mydata[["Survived"]], mydata[["Sex"]])
print(cross_tabulation)

##    
##     female male
##   0     81  468
##   1    233  109

# The cross-tabulation shows that female passengers had a much higher survival rate than male passengers (233 of 314, about 74%, versus 109 of 577, about 19%).

#Q5.  Provide notched boxplots for Survived and Age (e.g., boxplot(mydata$Age~mydata$Survived, notch=TRUE, horizontal=T)).  What do you notice?
# Notched horizontal boxplots of Age, one box per Survived value.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
boxplot(
  Age ~ Survived,
  data = mydata,
  notch = TRUE,
  horizontal = TRUE,
  xlab = "Age",
  ylab = "Survived",
  main = "Age vs Survived boxplot graph"
)

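# The boxplot compares the age distributions of passengers who survived (1) and who did not (0); it does not involve Sex, so it cannot by itself confirm the Question 4 finding that female passengers had a higher survival rate than male passengers.
# Non-overlapping notches would be strong evidence that the median ages of the two groups differ; overlapping notches would indicate no clear difference in median age.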