setwd("C:/Data Science Fundation with R/R-Course-HTML-Notes/R-Course-HTML-Notes/R-for-Data-Science-and-Machine-Learning/Machine Learning with R")
list.files("C:/Data Science Fundation with R/R-Course-HTML-Notes/R-Course-HTML-Notes/R-for-Data-Science-and-Machine-Learning/Machine Learning with R")
 [1] "anscombe.png"                                 "Decision Trees and Random Forests.html"      
 [3] "K-means Clustering.html"                      "K Nearest Neighbors.html"                    
 [5] "Linear Regression Lecture.html"               "Logistic Regression Lecture.html"            
 [7] "Natural Language Processing.html"             "Neural Nets.html"                            
 [9] "student-mat.csv"                              "Student Performance_Linear Regression_ML.Rmd"
[11] "Support Vector Machines.html"                 "test.csv"                                    
[13] "Titanic survival factors.nb.html"             "Titanic survival factors.Rmd"                
[15] "titanic_test.csv"                             "titanic_train.csv"                           
Titanic.train <- read.csv('titanic_train.csv', sep = ',')
Titanic.test <- read.csv('test.csv', sep = ',')
head(Titanic.train)
  PassengerId Survived Pclass                                                Name    Sex Age SibSp Parch           Ticket
1           1        0      3                             Braund, Mr. Owen Harris   male  22     1     0        A/5 21171
2           2        1      1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0         PC 17599
3           3        1      3                              Heikkinen, Miss. Laina female  26     0     0 STON/O2. 3101282
4           4        1      1        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0           113803
5           5        0      3                            Allen, Mr. William Henry   male  35     0     0           373450
6           6        0      3                                    Moran, Mr. James   male  NA     0     0           330877
     Fare Cabin Embarked
1  7.2500              S
2 71.2833   C85        C
3  7.9250              S
4 53.1000  C123        S
5  8.0500              S
6  8.4583              Q
library(Amelia)
missmap(Titanic.train, main = "Missing data detective_Titanic Training Data", col = c("red","blue"), legend = F)

library(ggplot2)
library(ggthemes)
ggplot(Titanic.train, aes(Survived )) +geom_bar(aes(fill=factor(Survived)))

ggplot(Titanic.train, aes(Pclass)) + geom_bar(aes(fill=factor(Pclass)), alpha=0.6)

ggplot(Titanic.train, aes(Sex)) + geom_bar(aes(fill=factor(Survived)))

ggplot(Titanic.train, aes(Age)) + geom_histogram(fill="purple", bins = 30, alpha=0.5)
Warning: Removed 177 rows containing non-finite values (stat_bin).

ggplot(Titanic.train, aes(SibSp)) + geom_bar(aes(fill=factor(Pclass)), alpha=0.8)

ggplot(Titanic.train, aes(Fare)) +geom_histogram(fill="green", color="white", alpha=0.7) + theme_few()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(Titanic.train, aes(Pclass,Age)) + geom_boxplot(aes(group=Pclass, fill=factor(Pclass)), alpha=0.5) + scale_y_continuous(breaks = seq(min(0),max(80),by=5))
Warning: Removed 177 rows containing non-finite values (stat_boxplot).

impute.age <- function(age, class){
  out <- age
  for(i in 1:length(age)){
    if(is.na(age[i])){
      if (class[i] == 1){
        out <- 37
      } else if (class[i]==2){
        out <- 29
      } else {
        out <- 24
      }
    }else {
      out <- age[i]
    }
  }
  return(out)
}
fix.age <- impute.age(Titanic.train$Age, Titanic.train$Pclass)
Titanic.train$Age <- fix.age
library(Amelia)
missmap(Titanic.train, main = "Missing data detective_Titanic Training Data", col = c("red","blue"), legend = F)

str(Titanic.train)
'data.frame':   891 obs. of  12 variables:
 $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
 $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
 $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
 $ Name       : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
 $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
 $ Age        : num  32 32 32 32 32 32 32 32 32 32 ...
 $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
 $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
 $ Ticket     : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
 $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
 $ Cabin      : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
 $ Embarked   : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
library(dplyr)
Titanic.train.N <- select(Titanic.train, -PassengerId, -Name, -Ticket, -Cabin)
head(Titanic.train.N, 4)
  Survived Pclass    Sex Age SibSp Parch    Fare Embarked
1        0      3   male  32     1     0  7.2500        S
2        1      1 female  32     1     0 71.2833        C
3        1      3 female  32     0     0  7.9250        S
4        1      1 female  32     1     0 53.1000        S
str(Titanic.train.N)
'data.frame':   891 obs. of  8 variables:
 $ Survived: int  0 1 1 1 0 0 0 0 1 1 ...
 $ Pclass  : int  3 1 3 1 3 3 1 3 3 2 ...
 $ Sex     : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
 $ Age     : num  32 32 32 32 32 32 32 32 32 32 ...
 $ SibSp   : int  1 1 0 1 0 0 0 3 0 1 ...
 $ Parch   : int  0 0 0 0 0 0 0 1 2 0 ...
 $ Fare    : num  7.25 71.28 7.92 53.1 8.05 ...
 $ Embarked: Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
Titanic.train.N$Survived <- factor(Titanic.train.N$Survived)
Titanic.train.N$Pclass <- factor(Titanic.train.N$Pclass)
Titanic.train.N$Parch <- factor(Titanic.train.N$Parch)
Titanic.train.N$SibSp <- factor(Titanic.train.N$SibSp)
str(Titanic.train.N)
'data.frame':   891 obs. of  8 variables:
 $ Survived: Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
 $ Pclass  : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
 $ Sex     : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
 $ Age     : num  32 32 32 32 32 32 32 32 32 32 ...
 $ SibSp   : Factor w/ 7 levels "0","1","2","3",..: 2 2 1 2 1 1 1 4 1 2 ...
 $ Parch   : Factor w/ 7 levels "0","1","2","3",..: 1 1 1 1 1 1 1 2 3 1 ...
 $ Fare    : num  7.25 71.28 7.92 53.1 8.05 ...
 $ Embarked: Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
Titanic.Logistic <- glm(formula = Survived ~ ., family =binomial(link='logit'), data = Titanic.train.N)
summary(Titanic.Logistic)

Call:
glm(formula = Survived ~ ., family = binomial(link = "logit"), 
    data = Titanic.train.N)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.4036  -0.6636  -0.4363   0.5838   2.5049  

Coefficients: (1 not defined because of singularities)
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)  1.634e+01  1.697e+03   0.010  0.99232    
Pclass2     -6.387e-01  2.882e-01  -2.216  0.02669 *  
Pclass3     -1.533e+00  2.749e-01  -5.576 2.46e-08 ***
Sexmale     -2.662e+00  1.996e-01 -13.336  < 2e-16 ***
Age                 NA         NA      NA       NA    
SibSp1       1.177e-01  2.213e-01   0.532  0.59485    
SibSp2      -7.731e-02  5.158e-01  -0.150  0.88085    
SibSp3      -1.997e+00  7.118e-01  -2.806  0.00501 ** 
SibSp4      -1.270e+00  7.641e-01  -1.662  0.09644 .  
SibSp5      -1.583e+01  9.435e+02  -0.017  0.98662    
SibSp8      -1.639e+01  7.544e+02  -0.022  0.98267    
Parch1       5.491e-01  2.832e-01   1.939  0.05253 .  
Parch2       4.146e-01  3.680e-01   1.126  0.25997    
Parch3       6.234e-03  1.086e+00   0.006  0.99542    
Parch4      -1.642e+01  1.067e+03  -0.015  0.98772    
Parch5      -1.703e+00  1.166e+00  -1.461  0.14396    
Parch6      -1.715e+01  2.400e+03  -0.007  0.99430    
Fare         2.844e-03  2.541e-03   1.119  0.26312    
EmbarkedC   -1.411e+01  1.697e+03  -0.008  0.99336    
EmbarkedQ   -1.413e+01  1.697e+03  -0.008  0.99336    
EmbarkedS   -1.447e+01  1.697e+03  -0.009  0.99320    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1186.66  on 890  degrees of freedom
Residual deviance:  785.79  on 871  degrees of freedom
AIC: 825.79

Number of Fisher Scoring iterations: 15
LS0tDQp0aXRsZTogIlRpdGFuaWMgc3Vydml2ZWRfTG9naXN0aWMgcmVncmVzc2lvbiINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KQXV0aG9yOiBaaHVhbmdmYW5nIFlpLCBodHRwczovL2dlb3lpLm9yZy4NCi0tLQ0KDQoNCmBgYHtyfQ0Kc2V0d2QoIkM6L0RhdGEgU2NpZW5jZSBGdW5kYXRpb24gd2l0aCBSL1ItQ291cnNlLUhUTUwtTm90ZXMvUi1Db3Vyc2UtSFRNTC1Ob3Rlcy9SLWZvci1EYXRhLVNjaWVuY2UtYW5kLU1hY2hpbmUtTGVhcm5pbmcvTWFjaGluZSBMZWFybmluZyB3aXRoIFIiKQ0KbGlzdC5maWxlcygiQzovRGF0YSBTY2llbmNlIEZ1bmRhdGlvbiB3aXRoIFIvUi1Db3Vyc2UtSFRNTC1Ob3Rlcy9SLUNvdXJzZS1IVE1MLU5vdGVzL1ItZm9yLURhdGEtU2NpZW5jZS1hbmQtTWFjaGluZS1MZWFybmluZy9NYWNoaW5lIExlYXJuaW5nIHdpdGggUiIpDQpUaXRhbmljLnRyYWluIDwtIHJlYWQuY3N2KCd0aXRhbmljX3RyYWluLmNzdicsIHNlcCA9ICcsJykNClRpdGFuaWMudGVzdCA8LSByZWFkLmNzdigndGVzdC5jc3YnLCBzZXAgPSAnLCcpDQpoZWFkKFRpdGFuaWMudHJhaW4pDQpgYGANCg0KYGBge3J9DQpsaWJyYXJ5KEFtZWxpYSkNCm1pc3NtYXAoVGl0YW5pYy50cmFpbiwgbWFpbiA9ICJNaXNzaW5nIGRhdGEgZGV0ZWN0aXZlX1RpdGFuaWMgVHJhaW5pbmcgRGF0YSIsIGNvbCA9IGMoInJlZCIsImJsdWUiKSwgbGVnZW5kID0gRikNCmBgYA0KDQpgYGB7cn0NCmxpYnJhcnkoZ2dwbG90MikNCmxpYnJhcnkoZ2d0aGVtZXMpDQpnZ3Bsb3QoVGl0YW5pYy50cmFpbiwgYWVzKFN1cnZpdmVkICkpICtnZW9tX2JhcihhZXMoZmlsbD1mYWN0b3IoU3Vydml2ZWQpKSkNCmBgYA0KDQpgYGB7cn0NCmdncGxvdChUaXRhbmljLnRyYWluLCBhZXMoUGNsYXNzKSkgKyBnZW9tX2JhcihhZXMoZmlsbD1mYWN0b3IoUGNsYXNzKSksIGFscGhhPTAuNikNCmBgYA0KDQpgYGB7cn0NCmdncGxvdChUaXRhbmljLnRyYWluLCBhZXMoU2V4KSkgKyBnZW9tX2JhcihhZXMoZmlsbD1mYWN0b3IoU3Vydml2ZWQpKSkNCmBgYA0KDQpgYGB7cn0NCmdncGxvdChUaXRhbmljLnRyYWluLCBhZXMoQWdlKSkgKyBnZW9tX2hpc3RvZ3JhbShmaWxsPSJwdXJwbGUiLCBiaW5zID0gMzAsIGFscGhhPTAuNSkNCmBgYA0KDQpgYGB7cn0NCmdncGxvdChUaXRhbmljLnRyYWluLCBhZXMoU2liU3ApKSArIGdlb21fYmFyKGFlcyhmaWxsPWZhY3RvcihQY2xhc3MpKSwgYWxwaGE9MC44KQ0KYGBgDQpgYGB7cn0NCmdncGxvdChUaXRhbmljLnRyYWluLCBhZXMoRmFyZSkpICtnZW9tX2hpc3RvZ3JhbShmaWxsPSJncmVlbiIsIGNvbG9yPSJ3aGl0ZSIsIGFscGhhPTAuNykgKyB0aGVtZV9mZXcoKQ0KYGBgDQpgYGB7cn0NCmdncGxvdChUaXRhbmljLnRyYWluLCBhZXMoUGNsYXNzLEFnZSkpICsgZ2VvbV9ib3hwbG90KGFlcyhncm91cD1QY2xhc3MsIGZpbGw9ZmFjdG9yKFBjbGFzcykpLCBhbHBoYT0wLjUpICsgc2NhbGVfeV9jb250aW51b3VzKGJyZWFrcyA9IHNlcShtaW4oMCksbWF4KDgwKSxieT01KSkNCmBgYA0KYGBge3J9DQppbXB1dGUuYWdlIDwtIGZ1bmN0aW9uKGFnZSwgY2xhc3Mpew0KICBvdXQgPC0gYWdlDQogIGZvcihpIGluIDE6bGVuZ3RoKGFnZSkpew0KICAgIGlmKGlzLm5hKGFnZVtpXSkpew0KICAgICAgaWYgKGNsYXNzW2ldID09IDEpew0KICAgICAgICBvdXQgPC0gMzcNCiAgICAgIH0gZWxzZSBpZiAoY2xhc3NbaV09PTIpew0KICAgICAgICBvdXQgPC0gMjkNCiAgICAgIH0gZWxzZSB7DQogICAgICAgIG91dCA8LSAyNA0KICAgICAgfQ0KICAgIH1lbHNlIHsNCiAgICAgIG91dCA8LSBhZ2VbaV0NCiAgICB9DQogIH0NCiAgcmV0dXJuKG91dCkNCn0NCmZpeC5hZ2UgPC0gaW1wdXRlLmFnZShUaXRhbmljLnRyYWluJEFnZSwgVGl0YW5pYy50cmFpbiRQY2xhc3MpDQpUaXRhbmljLnRyYWluJEFnZSA8LSBmaXguYWdlDQpgYGANCg0KYGBge3J9DQpsaWJyYXJ5KEFtZWxpYSkNCm1pc3NtYXAoVGl0YW5pYy50cmFpbiwgbWFpbiA9ICJNaXNzaW5nIGRhdGEgZGV0ZWN0aXZlX1RpdGFuaWMgVHJhaW5pbmcgRGF0YSIsIGNvbCA9IGMoInJlZCIsImJsdWUiKSwgbGVnZW5kID0gRikNCmBgYA0KYGBge3J9DQpzdHIoVGl0YW5pYy50cmFpbikNCmBgYA0KYGBge3J9DQpsaWJyYXJ5KGRwbHlyKQ0KVGl0YW5pYy50cmFpbi5OIDwtIHNlbGVjdChUaXRhbmljLnRyYWluLCAtUGFzc2VuZ2VySWQsIC1OYW1lLCAtVGlja2V0LCAtQ2FiaW4pDQpoZWFkKFRpdGFuaWMudHJhaW4uTiwgNCkNCnN0cihUaXRhbmljLnRyYWluLk4pDQpgYGANCmBgYHtyfQ0KVGl0YW5pYy50cmFpbi5OJFN1cnZpdmVkIDwtIGZhY3RvcihUaXRhbmljLnRyYWluLk4kU3Vydml2ZWQpDQpUaXRhbmljLnRyYWluLk4kUGNsYXNzIDwtIGZhY3RvcihUaXRhbmljLnRyYWluLk4kUGNsYXNzKQ0KVGl0YW5pYy50cmFpbi5OJFBhcmNoIDwtIGZhY3RvcihUaXRhbmljLnRyYWluLk4kUGFyY2gpDQpUaXRhbmljLnRyYWluLk4kU2liU3AgPC0gZmFjdG9yKFRpdGFuaWMudHJhaW4uTiRTaWJTcCkNCnN0cihUaXRhbmljLnRyYWluLk4pDQpgYGANCg0KYGBge3J9DQpUaXRhbmljLkxvZ2lzdGljIDwtIGdsbShmb3JtdWxhID0gU3Vydml2ZWQgfiAuLCBmYW1pbHkgPWJpbm9taWFsKGxpbms9J2xvZ2l0JyksIGRhdGEgPSBUaXRhbmljLnRyYWluLk4pDQpzdW1tYXJ5KFRpdGFuaWMuTG9naXN0aWMpDQpgYGANCg0KDQoNCg==