#Week 1 Assignment

Let's load the data.

# Read the training data from a local CSV file whose first row holds the
# column names.
mytraindata <- read.csv(file = "C:/R/classwork/train.csv", header = TRUE)
# mytraindata

# describe() used further down comes from the psych package. Install it only
# when it is missing. (The previous call requested a package literally named
# "r package", which CRAN reported as not available.)
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych", repos = "http://cran.us.r-project.org")
}
library("psych")
# No data() call is needed here: data() looks up datasets by name and warned
# "data set 'mytraindata' not found"; the data frame already exists because
# read.csv() created it above.
# Per-column overview: quartiles, mean and NA count for numeric columns,
# length/class/mode for character columns.
summary(mytraindata)
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
## 
# psych::describe(): n, mean, sd, median, trimmed mean, mad, range, skew,
# kurtosis and standard error for every column. Rows whose name ends in "*"
# are the character columns (compare with str() below); presumably they are
# converted to numeric codes first, so their statistics are not meaningful.
describe(mytraindata)
##             vars   n   mean     sd median trimmed    mad  min    max  range
## PassengerId    1 891 446.00 257.35 446.00  446.00 330.62 1.00 891.00 890.00
## Survived       2 891   0.38   0.49   0.00    0.35   0.00 0.00   1.00   1.00
## Pclass         3 891   2.31   0.84   3.00    2.39   0.00 1.00   3.00   2.00
## Name*          4 891 446.00 257.35 446.00  446.00 330.62 1.00 891.00 890.00
## Sex*           5 891   1.65   0.48   2.00    1.68   0.00 1.00   2.00   1.00
## Age            6 714  29.70  14.53  28.00   29.27  13.34 0.42  80.00  79.58
## SibSp          7 891   0.52   1.10   0.00    0.27   0.00 0.00   8.00   8.00
## Parch          8 891   0.38   0.81   0.00    0.18   0.00 0.00   6.00   6.00
## Ticket*        9 891 339.52 200.83 338.00  339.65 268.35 1.00 681.00 680.00
## Fare          10 891  32.20  49.69  14.45   21.38  10.24 0.00 512.33 512.33
## Cabin*        11 891  18.63  38.14   1.00    8.29   0.00 1.00 148.00 147.00
## Embarked*     12 891   3.53   0.80   4.00    3.66   0.00 1.00   4.00   3.00
##              skew kurtosis   se
## PassengerId  0.00    -1.20 8.62
## Survived     0.48    -1.77 0.02
## Pclass      -0.63    -1.28 0.03
## Name*        0.00    -1.20 8.62
## Sex*        -0.62    -1.62 0.02
## Age          0.39     0.16 0.54
## SibSp        3.68    17.73 0.04
## Parch        2.74     9.69 0.03
## Ticket*      0.00    -1.28 6.73
## Fare         4.77    33.12 1.66
## Cabin*       2.09     3.07 1.28
## Embarked*   -1.27    -0.16 0.03
# Compact structure listing: dimensions, then each column's type and first
# few values.
str(mytraindata)
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...
# Show the first six rows of the data frame.
head(mytraindata)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp Parch
## 1                             Braund, Mr. Owen Harris   male  22     1     0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3                              Heikkinen, Miss. Laina female  26     0     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5                            Allen, Mr. William Henry   male  35     0     0
## 6                                    Moran, Mr. James   male  NA     0     0
##             Ticket    Fare Cabin Embarked
## 1        A/5 21171  7.2500              S
## 2         PC 17599 71.2833   C85        C
## 3 STON/O2. 3101282  7.9250              S
## 4           113803 53.1000  C123        S
## 5           373450  8.0500              S
## 6           330877  8.4583              Q
# Mean, standard deviation and median of the ticket price. The column is
# named "Fare"; "Fair" does not exist in the data frame, so `$` returned NULL
# and the statistics came back as NA / NULL.
mean(mytraindata$Fare)
sd(mytraindata$Fare)
median(mytraindata$Fare)
# Expected values, per the describe() output above: mean 32.20, sd 49.69,
# median 14.45.

# The same three statistics for PassengerId.
mean(mytraindata$PassengerId)
## [1] 446
sd(mytraindata$PassengerId)
## [1] 257.3538
median(mytraindata$PassengerId)
## [1] 446
#class(mytraindata)
# typeof() reports the underlying storage type; for this data frame it is
# "list".
typeof(mytraindata)
## [1] "list"
# A single column can be tested directly ...
is.integer(mytraindata$PassengerId)
## [1] TRUE
# ... but wrapping the whole data frame in array() does not produce an
# integer vector, so the test is FALSE.
is.integer(array(mytraindata))
## [1] FALSE

Here are histograms of Fare, Pclass, and Age.

# Distribution of ticket fares.
hist(mytraindata$Fare)

# Distribution of passenger class (Pclass ranges from 1 to 3 in the summary
# above).
hist(mytraindata$Pclass)

# Distribution of passenger age (Age has 177 NA values in the summary above).
hist(mytraindata$Age)

What are the types of the variables (quantitative / qualitative) and their levels of measurement?

Answer: PassengerId is qualitative and nominal. Each value is a unique label. In this dataset the IDs happen to be the numbers 1, 2, 3, 4, …, so they can be sorted, but that order carries no meaning, and identifiers in general (e.g. APEC001, APEC1105) cannot be meaningfully ordered.

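Age is quantitative and measured on a ratio scale: it has a true zero, and differences and ratios between ages are meaningful. You can sort by Age, and it becomes ordinal only when it is binned into age groups.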

#Which variable has the most missing observations

colSums(is.na(mytraindata)) # Age has some NA values

sum(is.na(mytraindata$Age)) # 177 NA values

# is.na() does not see blank strings: read.csv() kept empty cells of character
# columns as "" (see Cabin in the str() output), so count those as missing too
# before deciding which variable has the most missing observations.
vapply(
  mytraindata,
  function(column) sum(is.na(column) | (is.character(column) & column == "")),
  integer(1)
)

Impute missing observations for Age, SibSp, and Parch with the column median

# Replace the missing values of Age, SibSp and Parch with the median of the
# observed values in the same column.
for (col_name in c("Age", "SibSp", "Parch")) {
  col_values <- mytraindata[[col_name]]
  # Skip columns without NAs so their storage type is left untouched.
  if (anyNA(col_values)) {
    col_median <- median(col_values, na.rm = TRUE)
    mytraindata[[col_name]][is.na(col_values)] <- col_median
  }
}

# Check: every count should now be 0.
colSums(is.na(mytraindata[, c("Age", "SibSp", "Parch")]))

descriptive statistics for Age, SibSp, and Parch

# describe() comes from the psych package; install it only when it is missing
# instead of reinstalling on every run.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
library("psych")

# Descriptive statistics, one column at a time.
describe(mytraindata$Age)
describe(mytraindata$SibSp)
describe(mytraindata$Parch)

# Descriptive statistics for Age, SibSp, and Parch in one call: pass the
# columns together as a data frame. Supplying them as separate positional
# arguments does not work, because describe() treats only its first argument
# as data.
describe(mytraindata[, c("Age", "SibSp", "Parch")])

Provide a cross-tabulation of Survived and Sex

# Cross-tabulation: rows are the Survived codes, columns are Sex.
table(mytraindata$Survived, mytraindata$Sex)
##    
##     female male
##   0     81  468
##   1    233  109

Provide notched boxplots for Survived and Age

# Notched boxplots of Age grouped by Survived. The formula interface draws
# one box of Age per Survived value. Passing the two columns as separate
# arguments (as before) drew one box for the 0/1 Survived column and one for
# Age on a shared axis; the 0/1 column is presumably what triggered the
# "some notches went outside hinges" warning.
boxplot(Age ~ Survived, data = mytraindata, notch = TRUE, horizontal = TRUE)

# The same plot without notches, for comparison.
boxplot(Age ~ Survived, data = mytraindata, notch = FALSE, horizontal = TRUE)

# Age grouped by passenger class.
boxplot(Age ~ Pclass, data = mytraindata, notch = TRUE, horizontal = TRUE)