setwd("C:/Data Science Fundation with R/R-Course-HTML-Notes/R-Course-HTML-Notes/R-for-Data-Science-and-Machine-Learning/Machine Learning with R")
list.files("C:/Data Science Fundation with R/R-Course-HTML-Notes/R-Course-HTML-Notes/R-for-Data-Science-and-Machine-Learning/Machine Learning with R")
[1] "anscombe.png" "Decision Trees and Random Forests.html"
[3] "K-means Clustering.html" "K Nearest Neighbors.html"
[5] "Linear Regression Lecture.html" "Logistic Regression Lecture.html"
[7] "Natural Language Processing.html" "Neural Nets.html"
[9] "student-mat.csv" "Student Performance_Linear Regression_ML.Rmd"
[11] "Support Vector Machines.html" "test.csv"
[13] "Titanic survival factors.nb.html" "Titanic survival factors.Rmd"
[15] "titanic_test.csv" "titanic_train.csv"
Titanic.train <- read.csv('titanic_train.csv', sep = ',')
Titanic.test <- read.csv('test.csv', sep = ',')
head(Titanic.train)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket
1 1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171
2 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0 PC 17599
3 3 1 3 Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282
4 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 113803
5 5 0 3 Allen, Mr. William Henry male 35 0 0 373450
6 6 0 3 Moran, Mr. James male NA 0 0 330877
Fare Cabin Embarked
1 7.2500 S
2 71.2833 C85 C
3 7.9250 S
4 53.1000 C123 S
5 8.0500 S
6 8.4583 Q
library(Amelia)
missmap(Titanic.train, main = "Missing data detective_Titanic Training Data", col = c("red","blue"), legend = F)

library(ggplot2)
library(ggthemes)
ggplot(Titanic.train, aes(Survived )) +geom_bar(aes(fill=factor(Survived)))

ggplot(Titanic.train, aes(Pclass)) + geom_bar(aes(fill=factor(Pclass)), alpha=0.6)

ggplot(Titanic.train, aes(Sex)) + geom_bar(aes(fill=factor(Survived)))

ggplot(Titanic.train, aes(Age)) + geom_histogram(fill="purple", bins = 30, alpha=0.5)
Warning: Removed 177 rows containing non-finite values (stat_bin).

ggplot(Titanic.train, aes(SibSp)) + geom_bar(aes(fill=factor(Pclass)), alpha=0.8)

ggplot(Titanic.train, aes(Fare)) +geom_histogram(fill="green", color="white", alpha=0.7) + theme_few()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(Titanic.train, aes(Pclass,Age)) + geom_boxplot(aes(group=Pclass, fill=factor(Pclass)), alpha=0.5) + scale_y_continuous(breaks = seq(min(0),max(80),by=5))
Warning: Removed 177 rows containing non-finite values (stat_boxplot).

impute.age <- function(age, class){
out <- age
for(i in 1:length(age)){
if(is.na(age[i])){
if (class[i] == 1){
out <- 37
} else if (class[i]==2){
out <- 29
} else {
out <- 24
}
}else {
out <- age[i]
}
}
return(out)
}
fix.age <- impute.age(Titanic.train$Age, Titanic.train$Pclass)
Titanic.train$Age <- fix.age
library(Amelia)
missmap(Titanic.train, main = "Missing data detective_Titanic Training Data", col = c("red","blue"), legend = F)

str(Titanic.train)
'data.frame': 891 obs. of 12 variables:
$ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
$ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
$ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
$ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
$ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
$ Age : num 32 32 32 32 32 32 32 32 32 32 ...
$ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
$ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
$ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
$ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
$ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
$ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
library(dplyr)
Titanic.train.N <- select(Titanic.train, -PassengerId, -Name, -Ticket, -Cabin)
head(Titanic.train.N, 4)
Survived Pclass Sex Age SibSp Parch Fare Embarked
1 0 3 male 32 1 0 7.2500 S
2 1 1 female 32 1 0 71.2833 C
3 1 3 female 32 0 0 7.9250 S
4 1 1 female 32 1 0 53.1000 S
str(Titanic.train.N)
'data.frame': 891 obs. of 8 variables:
$ Survived: int 0 1 1 1 0 0 0 0 1 1 ...
$ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
$ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
$ Age : num 32 32 32 32 32 32 32 32 32 32 ...
$ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
$ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
$ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
$ Embarked: Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
Titanic.train.N$Survived <- factor(Titanic.train.N$Survived)
Titanic.train.N$Pclass <- factor(Titanic.train.N$Pclass)
Titanic.train.N$Parch <- factor(Titanic.train.N$Parch)
Titanic.train.N$SibSp <- factor(Titanic.train.N$SibSp)
str(Titanic.train.N)
'data.frame': 891 obs. of 8 variables:
$ Survived: Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
$ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
$ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
$ Age : num 32 32 32 32 32 32 32 32 32 32 ...
$ SibSp : Factor w/ 7 levels "0","1","2","3",..: 2 2 1 2 1 1 1 4 1 2 ...
$ Parch : Factor w/ 7 levels "0","1","2","3",..: 1 1 1 1 1 1 1 2 3 1 ...
$ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
$ Embarked: Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
Titanic.Logistic <- glm(formula = Survived ~ ., family =binomial(link='logit'), data = Titanic.train.N)
summary(Titanic.Logistic)
Call:
glm(formula = Survived ~ ., family = binomial(link = "logit"),
data = Titanic.train.N)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.4036 -0.6636 -0.4363 0.5838 2.5049
Coefficients: (1 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.634e+01 1.697e+03 0.010 0.99232
Pclass2 -6.387e-01 2.882e-01 -2.216 0.02669 *
Pclass3 -1.533e+00 2.749e-01 -5.576 2.46e-08 ***
Sexmale -2.662e+00 1.996e-01 -13.336 < 2e-16 ***
Age NA NA NA NA
SibSp1 1.177e-01 2.213e-01 0.532 0.59485
SibSp2 -7.731e-02 5.158e-01 -0.150 0.88085
SibSp3 -1.997e+00 7.118e-01 -2.806 0.00501 **
SibSp4 -1.270e+00 7.641e-01 -1.662 0.09644 .
SibSp5 -1.583e+01 9.435e+02 -0.017 0.98662
SibSp8 -1.639e+01 7.544e+02 -0.022 0.98267
Parch1 5.491e-01 2.832e-01 1.939 0.05253 .
Parch2 4.146e-01 3.680e-01 1.126 0.25997
Parch3 6.234e-03 1.086e+00 0.006 0.99542
Parch4 -1.642e+01 1.067e+03 -0.015 0.98772
Parch5 -1.703e+00 1.166e+00 -1.461 0.14396
Parch6 -1.715e+01 2.400e+03 -0.007 0.99430
Fare 2.844e-03 2.541e-03 1.119 0.26312
EmbarkedC -1.411e+01 1.697e+03 -0.008 0.99336
EmbarkedQ -1.413e+01 1.697e+03 -0.008 0.99336
EmbarkedS -1.447e+01 1.697e+03 -0.009 0.99320
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1186.66 on 890 degrees of freedom
Residual deviance: 785.79 on 871 degrees of freedom
AIC: 825.79
Number of Fisher Scoring iterations: 15
LS0tDQp0aXRsZTogIlRpdGFuaWMgc3Vydml2ZWRfTG9naXN0aWMgcmVncmVzc2lvbiINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KQXV0aG9yOiBaaHVhbmdmYW5nIFlpLCBodHRwczovL2dlb3lpLm9yZy4NCi0tLQ0KDQoNCmBgYHtyfQ0Kc2V0d2QoIkM6L0RhdGEgU2NpZW5jZSBGdW5kYXRpb24gd2l0aCBSL1ItQ291cnNlLUhUTUwtTm90ZXMvUi1Db3Vyc2UtSFRNTC1Ob3Rlcy9SLWZvci1EYXRhLVNjaWVuY2UtYW5kLU1hY2hpbmUtTGVhcm5pbmcvTWFjaGluZSBMZWFybmluZyB3aXRoIFIiKQ0KbGlzdC5maWxlcygiQzovRGF0YSBTY2llbmNlIEZ1bmRhdGlvbiB3aXRoIFIvUi1Db3Vyc2UtSFRNTC1Ob3Rlcy9SLUNvdXJzZS1IVE1MLU5vdGVzL1ItZm9yLURhdGEtU2NpZW5jZS1hbmQtTWFjaGluZS1MZWFybmluZy9NYWNoaW5lIExlYXJuaW5nIHdpdGggUiIpDQpUaXRhbmljLnRyYWluIDwtIHJlYWQuY3N2KCd0aXRhbmljX3RyYWluLmNzdicsIHNlcCA9ICcsJykNClRpdGFuaWMudGVzdCA8LSByZWFkLmNzdigndGVzdC5jc3YnLCBzZXAgPSAnLCcpDQpoZWFkKFRpdGFuaWMudHJhaW4pDQpgYGANCg0KYGBge3J9DQpsaWJyYXJ5KEFtZWxpYSkNCm1pc3NtYXAoVGl0YW5pYy50cmFpbiwgbWFpbiA9ICJNaXNzaW5nIGRhdGEgZGV0ZWN0aXZlX1RpdGFuaWMgVHJhaW5pbmcgRGF0YSIsIGNvbCA9IGMoInJlZCIsImJsdWUiKSwgbGVnZW5kID0gRikNCmBgYA0KDQpgYGB7cn0NCmxpYnJhcnkoZ2dwbG90MikNCmxpYnJhcnkoZ2d0aGVtZXMpDQpnZ3Bsb3QoVGl0YW5pYy50cmFpbiwgYWVzKFN1cnZpdmVkICkpICtnZW9tX2JhcihhZXMoZmlsbD1mYWN0b3IoU3Vydml2ZWQpKSkNCmBgYA0KDQpgYGB7cn0NCmdncGxvdChUaXRhbmljLnRyYWluLCBhZXMoUGNsYXNzKSkgKyBnZW9tX2JhcihhZXMoZmlsbD1mYWN0b3IoUGNsYXNzKSksIGFscGhhPTAuNikNCmBgYA0KDQpgYGB7cn0NCmdncGxvdChUaXRhbmljLnRyYWluLCBhZXMoU2V4KSkgKyBnZW9tX2JhcihhZXMoZmlsbD1mYWN0b3IoU3Vydml2ZWQpKSkNCmBgYA0KDQpgYGB7cn0NCmdncGxvdChUaXRhbmljLnRyYWluLCBhZXMoQWdlKSkgKyBnZW9tX2hpc3RvZ3JhbShmaWxsPSJwdXJwbGUiLCBiaW5zID0gMzAsIGFscGhhPTAuNSkNCmBgYA0KDQpgYGB7cn0NCmdncGxvdChUaXRhbmljLnRyYWluLCBhZXMoU2liU3ApKSArIGdlb21fYmFyKGFlcyhmaWxsPWZhY3RvcihQY2xhc3MpKSwgYWxwaGE9MC44KQ0KYGBgDQpgYGB7cn0NCmdncGxvdChUaXRhbmljLnRyYWluLCBhZXMoRmFyZSkpICtnZW9tX2hpc3RvZ3JhbShmaWxsPSJncmVlbiIsIGNvbG9yPSJ3aGl0ZSIsIGFscGhhPTAuNykgKyB0aGVtZV9mZXcoKQ0KYGBgDQpgYGB7cn0NCmdncGxvdChUaXRhbmljLnRyYWluLCBhZXMoUGNsYXNzLEFnZSkpICsgZ2VvbV9ib3hwbG90KGFlcyhncm91cD1QY2xhc3MsIGZpbGw9ZmFjdG9yKFBjbGFzcykpLCBhbHBoYT0wLjUpICsgc2NhbGVfeV9jb250aW51b3VzKGJyZWFrcyA9IHNlcShtaW4oMCksbWF4KDgwKSxieT01KSkNCmBgYA0KYGBge3J9DQppbXB1dGUuYWdlIDwtIGZ1bmN0aW9uKGFnZSwgY2xhc3Mpew0KICBvdXQgPC0gYWdlDQogIGZvcihpIGluIDE6bGVuZ3RoKGFnZSkpew0KICAgIGlmKGlzLm5hKGFnZVtpXSkpew0KICAgICAgaWYgKGNsYXNzW2ldID09IDEpew0KICAgICAgICBvdXQgPC0gMzcNCiAgICAgIH0gZWxzZSBpZiAoY2xhc3NbaV09PTIpew0KICAgICAgICBvdXQgPC0gMjkNCiAgICAgIH0gZWxzZSB7DQogICAgICAgIG91dCA8LSAyNA0KICAgICAgfQ0KICAgIH1lbHNlIHsNCiAgICAgIG91dCA8LSBhZ2VbaV0NCiAgICB9DQogIH0NCiAgcmV0dXJuKG91dCkNCn0NCmZpeC5hZ2UgPC0gaW1wdXRlLmFnZShUaXRhbmljLnRyYWluJEFnZSwgVGl0YW5pYy50cmFpbiRQY2xhc3MpDQpUaXRhbmljLnRyYWluJEFnZSA8LSBmaXguYWdlDQpgYGANCg0KYGBge3J9DQpsaWJyYXJ5KEFtZWxpYSkNCm1pc3NtYXAoVGl0YW5pYy50cmFpbiwgbWFpbiA9ICJNaXNzaW5nIGRhdGEgZGV0ZWN0aXZlX1RpdGFuaWMgVHJhaW5pbmcgRGF0YSIsIGNvbCA9IGMoInJlZCIsImJsdWUiKSwgbGVnZW5kID0gRikNCmBgYA0KYGBge3J9DQpzdHIoVGl0YW5pYy50cmFpbikNCmBgYA0KYGBge3J9DQpsaWJyYXJ5KGRwbHlyKQ0KVGl0YW5pYy50cmFpbi5OIDwtIHNlbGVjdChUaXRhbmljLnRyYWluLCAtUGFzc2VuZ2VySWQsIC1OYW1lLCAtVGlja2V0LCAtQ2FiaW4pDQpoZWFkKFRpdGFuaWMudHJhaW4uTiwgNCkNCnN0cihUaXRhbmljLnRyYWluLk4pDQpgYGANCmBgYHtyfQ0KVGl0YW5pYy50cmFpbi5OJFN1cnZpdmVkIDwtIGZhY3RvcihUaXRhbmljLnRyYWluLk4kU3Vydml2ZWQpDQpUaXRhbmljLnRyYWluLk4kUGNsYXNzIDwtIGZhY3RvcihUaXRhbmljLnRyYWluLk4kUGNsYXNzKQ0KVGl0YW5pYy50cmFpbi5OJFBhcmNoIDwtIGZhY3RvcihUaXRhbmljLnRyYWluLk4kUGFyY2gpDQpUaXRhbmljLnRyYWluLk4kU2liU3AgPC0gZmFjdG9yKFRpdGFuaWMudHJhaW4uTiRTaWJTcCkNCnN0cihUaXRhbmljLnRyYWluLk4pDQpgYGANCg0KYGBge3J9DQpUaXRhbmljLkxvZ2lzdGljIDwtIGdsbShmb3JtdWxhID0gU3Vydml2ZWQgfiAuLCBmYW1pbHkgPWJpbm9taWFsKGxpbms9J2xvZ2l0JyksIGRhdGEgPSBUaXRhbmljLnRyYWluLk4pDQpzdW1tYXJ5KFRpdGFuaWMuTG9naXN0aWMpDQpgYGANCg0KDQoNCg==