#Week 1 Assignment
# Read the passenger training data (train.csv) from the local classwork
# folder; the first row of the file holds the column names.
mytraindata <- read.csv("C:/R/classwork/train.csv", header = TRUE)
# mytraindata
# Install psych only when it is missing, then attach it for describe().
# The original call asked CRAN for a package literally named "r package",
# which does not exist, so nothing was installed.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych", repos = "https://cloud.r-project.org")
}
library("psych")
# mytraindata was already created by read.csv() above. data() only loads
# datasets shipped with packages, so calling it on this name just warns.
# Check instead that the import produced a non-empty data frame.
stopifnot(is.data.frame(mytraindata), nrow(mytraindata) > 0)
# Per-column overview: min/quartiles/mean/max (plus NA count) for numeric
# columns, and length/class/mode for character columns.
summary(mytraindata)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
# Descriptive statistics per column (n, mean, sd, median, trimmed mean, mad,
# range, skew, kurtosis, se); describe() is available after library("psych").
# Rows flagged with `*` in the output (Name, Sex, Ticket, Cabin, Embarked)
# are the character columns.
# NOTE(review): psych presumably recodes those to numeric codes, so their
# statistics are not meaningful -- confirm against ?psych::describe.
describe(mytraindata)
## vars n mean sd median trimmed mad min max range
## PassengerId 1 891 446.00 257.35 446.00 446.00 330.62 1.00 891.00 890.00
## Survived 2 891 0.38 0.49 0.00 0.35 0.00 0.00 1.00 1.00
## Pclass 3 891 2.31 0.84 3.00 2.39 0.00 1.00 3.00 2.00
## Name* 4 891 446.00 257.35 446.00 446.00 330.62 1.00 891.00 890.00
## Sex* 5 891 1.65 0.48 2.00 1.68 0.00 1.00 2.00 1.00
## Age 6 714 29.70 14.53 28.00 29.27 13.34 0.42 80.00 79.58
## SibSp 7 891 0.52 1.10 0.00 0.27 0.00 0.00 8.00 8.00
## Parch 8 891 0.38 0.81 0.00 0.18 0.00 0.00 6.00 6.00
## Ticket* 9 891 339.52 200.83 338.00 339.65 268.35 1.00 681.00 680.00
## Fare 10 891 32.20 49.69 14.45 21.38 10.24 0.00 512.33 512.33
## Cabin* 11 891 18.63 38.14 1.00 8.29 0.00 1.00 148.00 147.00
## Embarked* 12 891 3.53 0.80 4.00 3.66 0.00 1.00 4.00 3.00
## skew kurtosis se
## PassengerId 0.00 -1.20 8.62
## Survived 0.48 -1.77 0.02
## Pclass -0.63 -1.28 0.03
## Name* 0.00 -1.20 8.62
## Sex* -0.62 -1.62 0.02
## Age 0.39 0.16 0.54
## SibSp 3.68 17.73 0.04
## Parch 2.74 9.69 0.03
## Ticket* 0.00 -1.28 6.73
## Fare 4.77 33.12 1.66
## Cabin* 2.09 3.07 1.28
## Embarked* -1.27 -0.16 0.03
# Compact structure listing: number of observations and variables, then each
# column's storage type and its first few values.
str(mytraindata)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
# Print the first six rows to eyeball the raw records.
head(mytraindata)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp Parch
## 1 Braund, Mr. Owen Harris male 22 1 0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## 3 Heikkinen, Miss. Laina female 26 0 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
## 5 Allen, Mr. William Henry male 35 0 0
## 6 Moran, Mr. James male NA 0 0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.2500 S
## 2 PC 17599 71.2833 C85 C
## 3 STON/O2. 3101282 7.9250 S
## 4 113803 53.1000 C123 S
## 5 373450 8.0500 S
## 6 330877 8.4583 Q
# Mean, standard deviation, and median of the Fare column.
# The column is named `Fare`; `mytraindata$Fair` is NULL, which made mean()
# and sd() return NA and median() return NULL.
mean(mytraindata$Fare)
sd(mytraindata$Fare)
median(mytraindata$Fare)
# Mean, standard deviation, and median of the PassengerId column.
mean(mytraindata[["PassengerId"]])
## [1] 446
sd(mytraindata[["PassengerId"]])
## [1] 257.3538
median(mytraindata[["PassengerId"]])
## [1] 446
# Inspect storage types. A data frame reports "list" as its underlying type.
# class(mytraindata)
typeof(mytraindata)
## [1] "list"
# The PassengerId column itself is stored as an integer vector.
is.integer(mytraindata[["PassengerId"]])
## [1] TRUE
# Wrapping the whole data frame in array() does not give an integer array.
is.integer(array(mytraindata))
## [1] FALSE
# Here are histograms of the Fare, Pclass, and Age columns.
# Draw one histogram each for Fare, Pclass, and Age.
# The argument expressions are kept as written because hist() derives its
# default title and x-axis label from them.
hist(mytraindata$Fare)
hist(mytraindata$Pclass)
hist(mytraindata$Age)
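# Age is (at least) ordinal: its values can be sorted, and they can also be binned into ordered age groups. (Strictly speaking, age in years is a ratio-scale numeric variable; it becomes purely ordinal once it is grouped.)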
# Which variable has the most missing observations?
# read.csv() turns missing numeric fields into NA but leaves empty text fields
# as "" (see the Cabin column in the str() output), so is.na() alone
# undercounts. Count both NA and blank strings as missing.
missing_counts <- vapply(
  mytraindata,
  function(column) {
    # Only character columns can hold blank strings; NA entries are never
    # flagged as blank here because nzchar(NA) is TRUE.
    blank <- is.character(column) & !nzchar(trimws(column))
    sum(is.na(column) | blank)
  },
  integer(1)
)
sort(missing_counts, decreasing = TRUE)
sum(is.na(mytraindata$Age)) # NA values in Age (177 per the summary() output)
# Impute missing ages with the median of the observed ages.
# The earlier version took median() of the NA *count*, which just returned
# that count; the median must be computed on the Age values themselves.
MytrainAge <- median(mytraindata$Age, na.rm = TRUE)
MytrainAge
mytraindata$Age[is.na(mytraindata$Age)] <- MytrainAge
# Summarise the imputed column instead of printing every value; the summary
# no longer lists an NA count once the imputation has worked.
summary(mytraindata$Age)
# Descriptive statistics for Age, SibSp, and Parch.
# psych is installed only if it is missing, rather than on every run.
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych", repos = "https://cloud.r-project.org")
}
library("psych")
# describe() accepts a data frame, so the three columns fit in one call.
# Passing them as separate arguments would not work, because the extra
# vectors would be matched to describe()'s option parameters.
describe(mytraindata[c("Age", "SibSp", "Parch")])
# Cross-tabulate survival (rows: 0/1) against sex (columns).
table(mytraindata$Survived, mytraindata$Sex)
#     female male
#   0     81  468
#   1    233  109
# Compare the Age distribution across groups. The formula interface draws one
# box per group level. Passing two separate vectors (as before) instead drew
# one box for the group codes and one for Age on a shared axis; the box for
# the codes has its median on a hinge, which is what triggered the
# "notches went outside hinges" warning.
boxplot(Age ~ Survived, data = mytraindata, notch = FALSE, horizontal = TRUE)
boxplot(Age ~ Pclass, data = mytraindata, notch = FALSE, horizontal = TRUE)