library(ggplot2)
library(caret)
library(randomForest)
library(e1071)
library(grid)
library(gridExtra)
library(mice)
library(dplyr)
library(pscl)
library(knncat)
train <- read.csv("C:/Data Science/R/Exercises/Titanic/trainingdataset.csv",na.strings = c("N/A","DIV/0!",""), stringsAsFactors = FALSE)
test <- read.csv("C:/Data Science/R/Exercises/Titanic/testingdataset.csv",na.strings = c("N/A","DIV/0!",""),stringsAsFactors = FALSE)
training <- bind_rows(train,test)
summary(training)
## PassengerId Survived Pclass Name
## Min. : 1 Min. :0.0000 Min. :1.000 Length:1309
## 1st Qu.: 328 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median : 655 Median :0.0000 Median :3.000 Mode :character
## Mean : 655 Mean :0.3838 Mean :2.295
## 3rd Qu.: 982 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :1309 Max. :1.0000 Max. :3.000
## NA's :418
## Sex Age SibSp Parch
## Length:1309 Min. : 0.17 Min. :0.0000 Min. :0.000
## Class :character 1st Qu.:21.00 1st Qu.:0.0000 1st Qu.:0.000
## Mode :character Median :28.00 Median :0.0000 Median :0.000
## Mean :29.88 Mean :0.4989 Mean :0.385
## 3rd Qu.:39.00 3rd Qu.:1.0000 3rd Qu.:0.000
## Max. :80.00 Max. :8.0000 Max. :9.000
## NA's :263
## Ticket Fare Cabin
## Length:1309 Min. : 0.000 Length:1309
## Class :character 1st Qu.: 7.896 Class :character
## Mode :character Median : 14.454 Mode :character
## Mean : 33.295
## 3rd Qu.: 31.275
## Max. :512.329
## NA's :1
## Embarked
## Length:1309
## Class :character
## Mode :character
##
##
##
##
dim(training)
## [1] 1309 12
str(training)
## 'data.frame': 1309 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr NA "C85" NA "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
training[training==""] <- NA
a <- apply(training,2,is.na)
summary(a)
## PassengerId Survived Pclass Name
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:1309 FALSE:891 FALSE:1309 FALSE:1309
## NA's :0 TRUE :418 NA's :0 NA's :0
## NA's :0
## Sex Age SibSp Parch
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:1309 FALSE:1046 FALSE:1309 FALSE:1309
## NA's :0 TRUE :263 NA's :0 NA's :0
## NA's :0
## Ticket Fare Cabin Embarked
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:1309 FALSE:1308 FALSE:295 FALSE:1307
## NA's :0 TRUE :1 TRUE :1014 TRUE :2
## NA's :0 NA's :0 NA's :0
apply(a,2,sum)
## PassengerId Survived Pclass Name Sex Age
## 0 418 0 0 0 263
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 1 1014 2
training$Salutation <- gsub('(.*, )|(\\..*)', '',training$Name)
table(training$Sex,training$Salutation)
##
## Capt Col Don Dona Dr Jonkheer Lady Major Master Miss Mlle Mme
## female 0 0 0 1 1 0 1 0 0 260 2 1
## male 1 4 1 0 7 1 0 2 61 0 0 0
##
## Mr Mrs Ms Rev Sir the Countess
## female 0 197 2 0 0 1
## male 757 0 0 8 1 0
misc <- c("Capt","Col","Don","Dr","Jonkheer","Lady","Major","Rev","Sir","the Countess","Dona")
training$Salutation[training$Salutation == "Mlle"] <- "Miss"
training$Salutation[training$Salutation == "Mme"] <- "Miss"
training$Salutation[training$Salutation %in% misc] <- "Misc"
table(training$Sex,training$Salutation)
##
## Master Misc Miss Mr Mrs Ms
## female 0 4 263 0 197 2
## male 61 25 0 757 0 0
training$Surname <- sapply(training$Name,function(x) strsplit(x, split = '[,.]')[[1]][1])
s <- nlevels(factor(training$Surname))
paste('We have', s, 'unique surnames in the training dataset amongst',nrow(training), 'passangers.')
## [1] "We have 875 unique surnames in the training dataset amongst 1309 passangers."
training$FamilySize <- training$SibSp + training$Parch + 1
training$Family <- paste(training$Surname,'-',training$familysize)
training$Deck <- substr(training$Cabin,1,1)
paste("Titanic has", nlevels(factor(training$Deck)),"decks on the ship.")
## [1] "Titanic has 8 decks on the ship."
trainingreengineered <- training[,c("Survived","Pclass","Sex","Age","SibSp","Parch","Fare","Embarked","Salutation","Deck","FamilySize","Family")]
list <- c("Survived","Pclass","Sex","Embarked","Salutation","Deck","Family")
trainingreengineered[list] <- lapply(trainingreengineered[list],function(x) as.factor(x))
str(trainingreengineered)
## 'data.frame': 1309 obs. of 12 variables:
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Embarked : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
## $ Salutation: Factor w/ 6 levels "Master","Misc",..: 4 5 3 5 4 4 4 1 5 5 ...
## $ Deck : Factor w/ 8 levels "A","B","C","D",..: NA 3 NA 3 NA NA 5 NA NA NA ...
## $ FamilySize: num 2 2 1 2 1 1 1 5 3 2 ...
## $ Family : Factor w/ 875 levels "Abbing - ","Abbott - ",..: 101 183 335 273 16 544 506 614 388 565 ...
set.seed(6)
imp = mice(trainingreengineered, method = "rf", m=5)
##
## iter imp variable
## 1 1 Survived Age Fare Embarked Deck
## 1 2 Survived Age Fare Embarked Deck
## 1 3 Survived Age Fare Embarked Deck
## 1 4 Survived Age Fare Embarked Deck
## 1 5 Survived Age Fare Embarked Deck
## 2 1 Survived Age Fare Embarked Deck
## 2 2 Survived Age Fare Embarked Deck
## 2 3 Survived Age Fare Embarked Deck
## 2 4 Survived Age Fare Embarked Deck
## 2 5 Survived Age Fare Embarked Deck
## 3 1 Survived Age Fare Embarked Deck
## 3 2 Survived Age Fare Embarked Deck
## 3 3 Survived Age Fare Embarked Deck
## 3 4 Survived Age Fare Embarked Deck
## 3 5 Survived Age Fare Embarked Deck
## 4 1 Survived Age Fare Embarked Deck
## 4 2 Survived Age Fare Embarked Deck
## 4 3 Survived Age Fare Embarked Deck
## 4 4 Survived Age Fare Embarked Deck
## 4 5 Survived Age Fare Embarked Deck
## 5 1 Survived Age Fare Embarked Deck
## 5 2 Survived Age Fare Embarked Deck
## 5 3 Survived Age Fare Embarked Deck
## 5 4 Survived Age Fare Embarked Deck
## 5 5 Survived Age Fare Embarked Deck
imputedtrainingreengineered = complete(imp)
summary(imp)
## Multiply imputed data set
## Call:
## mice(data = trainingreengineered, m = 5, method = "rf")
## Number of multiple imputations: 5
## Missing cells per column:
## Survived Pclass Sex Age SibSp Parch
## 418 0 0 263 0 0
## Fare Embarked Salutation Deck FamilySize Family
## 1 2 0 1014 0 0
## Imputation methods:
## Survived Pclass Sex Age SibSp Parch
## "rf" "rf" "rf" "rf" "rf" "rf"
## Fare Embarked Salutation Deck FamilySize Family
## "rf" "rf" "rf" "rf" "rf" "rf"
## VisitSequence:
## Survived Age Fare Embarked Deck
## 1 4 7 8 10
## PredictorMatrix:
## Survived Pclass Sex Age SibSp Parch Fare Embarked Salutation
## Survived 0 1 1 1 1 1 1 1 1
## Pclass 0 0 0 0 0 0 0 0 0
## Sex 0 0 0 0 0 0 0 0 0
## Age 1 1 1 0 1 1 1 1 1
## SibSp 0 0 0 0 0 0 0 0 0
## Parch 0 0 0 0 0 0 0 0 0
## Fare 1 1 1 1 1 1 0 1 1
## Embarked 1 1 1 1 1 1 1 0 1
## Salutation 0 0 0 0 0 0 0 0 0
## Deck 1 1 1 1 1 1 1 1 1
## FamilySize 0 0 0 0 0 0 0 0 0
## Family 0 0 0 0 0 0 0 0 0
## Deck FamilySize Family
## Survived 1 1 1
## Pclass 0 0 0
## Sex 0 0 0
## Age 1 1 1
## SibSp 0 0 0
## Parch 0 0 0
## Fare 1 1 1
## Embarked 1 1 1
## Salutation 0 0 0
## Deck 0 1 1
## FamilySize 0 0 0
## Family 0 0 0
## Random generator seed value: NA
apply(apply(imputedtrainingreengineered,2,is.na),2,sum)
## Survived Pclass Sex Age SibSp Parch
## 0 0 0 0 0 0
## Fare Embarked Salutation Deck FamilySize Family
## 0 0 0 0 0 0
par(mfrow=c(1,2))
hist(trainingreengineered$Fare, main = "Before Imputation", col = "violet")
hist(imputedtrainingreengineered$Fare, main = "Post Imputation", col = "blue")
#### As distribution pattern remains identical before and post imputation, quality of imputation is excellent.
ggplot(imputedtrainingreengineered,aes(x=FamilySize, fill = factor(Survived))) + geom_bar(stat = "count", position = "dodge") + scale_x_continuous(breaks = c(1:11)) + labs( x= "Familysize")
'Single' -> imputedtrainingreengineered$FamilySizeC[imputedtrainingreengineered$FamilySize == 1]
'Small' -> imputedtrainingreengineered$FamilySizeC[imputedtrainingreengineered$FamilySize > 1 & imputedtrainingreengineered$FamilySize < 5]
'Large' -> imputedtrainingreengineered$FamilySizeC[imputedtrainingreengineered$FamilySize >4]
mosaicplot(table(imputedtrainingreengineered$FamilySizeC,imputedtrainingreengineered$Survived), main = "Survival by Family Size", shade = TRUE)
ggplot(imputedtrainingreengineered,aes(x = Age, fill = factor(Survived))) + geom_histogram()+ facet_grid(.~Sex)
table(imputedtrainingreengineered$Sex,imputedtrainingreengineered$Survived)
##
## 0 1
## female 133 333
## male 676 167
imputedtrainingreengineered$Child[imputedtrainingreengineered$Age < 18] <- 'Child'
imputedtrainingreengineered$Child[imputedtrainingreengineered$Age>= 18] <- 'Adult'
table(imputedtrainingreengineered$Child,imputedtrainingreengineered$Survived)
##
## 0 1
## Adult 714 412
## Child 95 88
ggplot(imputedtrainingreengineered,aes(x = Age, fill = factor(Survived))) + geom_bar(stat = "count")+ facet_grid(.~Child)
imputedtrainingreengineered$Mother <- 'Not Mother'
imputedtrainingreengineered$Mother[imputedtrainingreengineered$Sex == 'female'& imputedtrainingreengineered$Age > 18 & imputedtrainingreengineered$Parch > 0 & imputedtrainingreengineered$Salutation != 'Miss'] <- 'Mother'
table(imputedtrainingreengineered$Mother,imputedtrainingreengineered$Survived)
##
## 0 1
## Mother 24 61
## Not Mother 785 439
list1 <- c("Child","Mother")
imputedtrainingreengineered[list1] <- lapply(imputedtrainingreengineered[list1],function(x) as.factor(x))
variables <- c("Pclass","Sex","Age","SibSp","Parch","Fare","Embarked","Salutation","Deck","FamilySize","Child","Mother","Survived")
imputedtrainingreengineered <- imputedtrainingreengineered[,variables]
str(imputedtrainingreengineered)
## 'data.frame': 1309 obs. of 13 variables:
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
## ..- attr(*, "contrasts")= num [1:3, 1:2] 0 1 0 0 0 1
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr "1" "2" "3"
## .. .. ..$ : chr "2" "3"
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## ..- attr(*, "contrasts")= num [1:2, 1] 0 1
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr "female" "male"
## .. .. ..$ : chr "2"
## $ Age : num 22 38 26 35 35 19 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Embarked : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
## ..- attr(*, "contrasts")= num [1:3, 1:2] 0 1 0 0 0 1
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr "C" "Q" "S"
## .. .. ..$ : chr "2" "3"
## $ Salutation: Factor w/ 6 levels "Master","Misc",..: 4 5 3 5 4 4 4 1 5 5 ...
## ..- attr(*, "contrasts")= num [1:6, 1:5] 0 1 0 0 0 0 0 0 1 0 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr "Master" "Misc" "Miss" "Mr" ...
## .. .. ..$ : chr "2" "3" "4" "5" ...
## $ Deck : Factor w/ 8 levels "A","B","C","D",..: 6 3 4 3 4 6 5 5 5 2 ...
## ..- attr(*, "contrasts")= num [1:8, 1:7] 0 1 0 0 0 0 0 0 0 0 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr "A" "B" "C" "D" ...
## .. .. ..$ : chr "2" "3" "4" "5" ...
## $ FamilySize: num 2 2 1 2 1 1 1 5 3 2 ...
## $ Child : Factor w/ 2 levels "Adult","Child": 1 1 1 1 1 1 1 2 1 2 ...
## $ Mother : Factor w/ 2 levels "Mother","Not Mother": 2 2 2 2 2 2 2 2 1 2 ...
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
## ..- attr(*, "contrasts")= num [1:2, 1] 0 1
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr "0" "1"
## .. .. ..$ : chr "2"
p1 <- ggplot(imputedtrainingreengineered,aes(x = Pclass,fill = factor(Survived)))+geom_bar(stat = "count", position = "stack")
p2 <- ggplot(imputedtrainingreengineered,aes(x = Pclass,fill = factor(Survived)))+geom_bar(position = "fill")+labs(y = "Proportion")
grid.arrange(p1,p2,ncol=2)
par(mfrow = c(1,2))
qplot(Embarked,data = imputedtrainingreengineered,geom = "density",fill = factor(Survived), alpha = I(0.5),xlab = "Embarked", ylab = "Density", main = "Kernel Density Plot for Port of Embarkment And Survival")
ggplot(imputedtrainingreengineered,aes(x = Embarked, fill = factor(Survived))) + geom_bar(position = "fill")+labs(y="Proportion")
ggplot(imputedtrainingreengineered,aes(Survived, fill= factor(Survived))) + geom_bar(stat = "count") + facet_grid(Deck ~Salutation)
tapply(imputedtrainingreengineered$Fare,imputedtrainingreengineered$Pclass,mean)
## 1 2 3
## 87.50899 21.17920 13.29506
qplot(Fare,Pclass,data = imputedtrainingreengineered, geom = c("point","smooth"), method = "lm", formula = y~x, col = Sex, main = "Regression of Fare on Passenger Class By Each Sex", ylab = "Passenger class", xlab = "Fare")
tapply(imputedtrainingreengineered$Fare,imputedtrainingreengineered$Deck,mean)
## A B C D E F G T
## 25.64440 55.14814 50.70856 26.95982 21.10704 13.91100 14.37612 19.79834
qplot(Deck,Fare, data = imputedtrainingreengineered, geom = c("boxplot"), fill = Sex, main = "Fare Per Deck",xlab = "", ylab = "Fare")
coplot(Age~Salutation|Survived,data = imputedtrainingreengineered, panel = panel.smooth,xlab = "Salutations",ylab = "Age",columns = 2)
# H~0~ : There is no difference between the two population means.
# H~1~ : There is difference between the two population means.
t.test(Fare~Survived,data = imputedtrainingreengineered)
##
## Welch Two Sample t-test
##
## data: Fare by Survived
## t = -8.1006, df = 646.65, p-value = 2.73e-15
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -33.23113 -20.26365
## sample estimates:
## mean in group 0 mean in group 1
## 23.05924 49.80663
# H~0~ : There is no difference between the two population means.
# H~1~ : There is difference between the two population means.
t.test(Age~Survived,data = imputedtrainingreengineered)
##
## Welch Two Sample t-test
##
## data: Age by Survived
## t = 0.038893, df = 966.54, p-value = 0.969
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.567218 1.630595
## sample estimates:
## mean in group 0 mean in group 1
## 29.34837 29.31668
# H~0~ : Passenger Class and Survivals are independent.
# H~1~ : Passenger Class and Survivals are not independent.
chisq.test(imputedtrainingreengineered$Pclass,imputedtrainingreengineered$Survived)
##
## Pearson's Chi-squared test
##
## data: imputedtrainingreengineered$Pclass and imputedtrainingreengineered$Survived
## X-squared = 117.28, df = 2, p-value < 2.2e-16
trainingdataset <- imputedtrainingreengineered[c(1:891),]
testingdataset <- imputedtrainingreengineered[c(892:1309),]
model1 <- glm(Survived ~ ., family = binomial(link = 'logit'), data = trainingdataset)
anova(model1, test = 'Chisq')
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Survived
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 890 1186.66
## Pclass 2 103.547 888 1083.11 < 2.2e-16 ***
## Sex 1 256.220 887 826.89 < 2.2e-16 ***
## Age 1 15.417 886 811.47 8.622e-05 ***
## SibSp 1 13.556 885 797.92 0.0002315 ***
## Parch 1 0.417 884 797.50 0.5181887
## Fare 1 1.520 883 795.98 0.2176495
## Embarked 2 3.814 881 792.16 0.1485020
## Salutation 5 62.405 876 729.76 3.866e-12 ***
## Deck 7 13.106 869 716.65 0.0695566 .
## FamilySize 0 0.000 869 716.65
## Child 1 0.711 868 715.94 0.3990952
## Mother 1 0.114 867 715.83 0.7361542
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pR2(model1)
## llh llhNull G2 McFadden r2ML
## -357.9136485 -593.3275684 470.8278399 0.3967689 0.4104680
## r2CU
## 0.5576976
fit1 <- predict(model1,newdata = testingdataset,type = 'response')
fit1 <- ifelse(fit1 > 0.5,1,0)
misclassificationerror <- mean( fit1 != testingdataset$Survived)
print(paste('Accuracy of Logistic Regression Model = ', 1-misclassificationerror))
## [1] "Accuracy of Logistic Regression Model = 0.727272727272727"
model2 <- randomForest(Survived ~ ., data = trainingdataset, importance = TRUE)
model2
##
## Call:
## randomForest(formula = Survived ~ ., data = trainingdataset, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 17.28%
## Confusion matrix:
## 0 1 class.error
## 0 495 54 0.09836066
## 1 100 242 0.29239766
varImpPlot(model2)
fit2 <- predict(model2,newdata = testingdataset)
confusionMatrix(fit2, testingdataset$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 208 63
## 1 52 95
##
## Accuracy : 0.7249
## 95% CI : (0.6794, 0.7672)
## No Information Rate : 0.622
## P-Value [Acc > NIR] : 5.966e-06
##
## Kappa : 0.4068
## Mcnemar's Test P-Value : 0.3511
##
## Sensitivity : 0.8000
## Specificity : 0.6013
## Pos Pred Value : 0.7675
## Neg Pred Value : 0.6463
## Prevalence : 0.6220
## Detection Rate : 0.4976
## Detection Prevalence : 0.6483
## Balanced Accuracy : 0.7006
##
## 'Positive' Class : 0
##
a <- confusionMatrix(fit2, testingdataset$Survived)$overall[1]
print(paste('Accuracy of Random Forest Model = ', a))
## [1] "Accuracy of Random Forest Model = 0.72488038277512"
model3 <- svm(Survived ~ ., data = trainingdataset)
model3
##
## Call:
## svm(formula = Survived ~ ., data = trainingdataset)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.04
##
## Number of Support Vectors: 418
fit3 <- predict(model3, newdata = testingdataset)
confusionMatrix(fit3, testingdataset$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 202 54
## 1 58 104
##
## Accuracy : 0.7321
## 95% CI : (0.6869, 0.7739)
## No Information Rate : 0.622
## P-Value [Acc > NIR] : 1.317e-06
##
## Kappa : 0.433
## Mcnemar's Test P-Value : 0.7768
##
## Sensitivity : 0.7769
## Specificity : 0.6582
## Pos Pred Value : 0.7891
## Neg Pred Value : 0.6420
## Prevalence : 0.6220
## Detection Rate : 0.4833
## Detection Prevalence : 0.6124
## Balanced Accuracy : 0.7176
##
## 'Positive' Class : 0
##
b <- confusionMatrix(fit3, testingdataset$Survived)$overall[1]
print(paste('Accuracy of Support Vector Machine Model = ', b))
## [1] "Accuracy of Support Vector Machine Model = 0.732057416267943"
model4 <- train(Survived ~ .,data = trainingdataset, method = "lda")
model4
## Linear Discriminant Analysis
##
## 891 samples
## 12 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 891, 891, 891, 891, 891, 891, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8173444 0.6098066
##
##
fit4 <- predict(model4, newdata = testingdataset)
confusionMatrix(fit4, testingdataset$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 202 56
## 1 58 102
##
## Accuracy : 0.7273
## 95% CI : (0.6819, 0.7694)
## No Information Rate : 0.622
## P-Value [Acc > NIR] : 3.648e-06
##
## Kappa : 0.4214
## Mcnemar's Test P-Value : 0.9254
##
## Sensitivity : 0.7769
## Specificity : 0.6456
## Pos Pred Value : 0.7829
## Neg Pred Value : 0.6375
## Prevalence : 0.6220
## Detection Rate : 0.4833
## Detection Prevalence : 0.6172
## Balanced Accuracy : 0.7112
##
## 'Positive' Class : 0
##
l <- confusionMatrix(fit4, testingdataset$Survived)$overall[1]
print(paste('Accuracy of Linear Discriminent Accuracy Model = ', l))
## [1] "Accuracy of Linear Discriminent Accuracy Model = 0.727272727272727"
Survived <- knncat(trainingdataset, classcol = 13)
Survived
## Training set misclass rate: 19.75%
fit5 <- predict(Survived,trainingdataset,testingdataset,train.classcol = 13,newdata.classcol = 13)
table(fit5,testingdataset$Survived)
##
## fit5 0 1
## 0 206 60
## 1 54 98
misclassificationerror <- mean( fit5 != testingdataset$Survived)
print(paste('Accuracy of KNN Model = ', 1-misclassificationerror))
## [1] "Accuracy of KNN Model = 0.727272727272727"