updatetitanic.R

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.4.3

## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --

## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.4.2     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0

## Warning: package 'ggplot2' was built under R version 3.4.2

## Warning: package 'tibble' was built under R version 3.4.3

## Warning: package 'tidyr' was built under R version 3.4.3

## Warning: package 'readr' was built under R version 3.4.3

## Warning: package 'purrr' was built under R version 3.4.3

## Warning: package 'dplyr' was built under R version 3.4.2

## Warning: package 'stringr' was built under R version 3.4.2

## Warning: package 'forcats' was built under R version 3.4.3

## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# 1. Load the data
# Read the Titanic training set from CSV into a data frame.
train <- read_csv(
  file = "C:/Users/Vaibhav Goyal/Desktop/simpl/projectsdone/titanic/train.csv"
)

## Parsed with column specification:
## cols(
##   PassengerId = col_integer(),
##   Survived = col_integer(),
##   Pclass = col_integer(),
##   Name = col_character(),
##   Sex = col_character(),
##   Age = col_double(),
##   SibSp = col_integer(),
##   Parch = col_integer(),
##   Ticket = col_character(),
##   Fare = col_double(),
##   Cabin = col_character(),
##   Embarked = col_character()
## )

# Read the Titanic test set from CSV into a data frame.
test <- read_csv(
  file = "C:/Users/Vaibhav Goyal/Desktop/simpl/projectsdone/titanic/test.csv"
)

## Parsed with column specification:
## cols(
##   PassengerId = col_integer(),
##   Pclass = col_integer(),
##   Name = col_character(),
##   Sex = col_character(),
##   Age = col_double(),
##   SibSp = col_integer(),
##   Parch = col_integer(),
##   Ticket = col_character(),
##   Fare = col_double(),
##   Cabin = col_character(),
##   Embarked = col_character()
## )

# Count the missing values in each column of the training set.
train %>%
  is.na() %>%
  colSums()

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0         177 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0         687           2

#we have missing values in the columns Age, Embarked, and Cabin within our Titanic training set
# Count the missing values in each column of the test set.
test %>%
  is.na() %>%
  colSums()

## PassengerId      Pclass        Name         Sex         Age       SibSp 
##           0           0           0           0          86           0 
##       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           1         327           0

# 2. Check how clean the data is
# The test set has missing values in Age, Cabin and Fare, and it has no
# Survived column. To clean both tables in one pass they are stacked into a
# single data frame. Two tables with a different number of columns cannot be
# stacked, so the test rows first get a placeholder Survived column holding
# the string "None".
test.survived <- data.frame(
  Survived = rep("None", times = nrow(test)),
  test[, ]
)
# Stack the training rows on top of the test rows.
combined <- rbind(train, test.survived)
# 3. Several columns contain NA values; the task now is to fill them in.
# Summarise Age by passenger class and sex, using complete rows only.
# NOTE(review): na.omit() drops a row when ANY column is NA, so rows with a
# missing Cabin are excluded here as well - confirm that this is intended.
combined %>%
  na.omit() %>%
  group_by(Pclass, Sex) %>%
  summarise(
    Mean = mean(Age),
    Median = median(Age),
    SD = sd(Age),
    Total = n()
  )

## Warning: package 'bindrcpp' was built under R version 3.4.2

## # A tibble: 6 x 6
## # Groups:   Pclass [?]
##   Pclass Sex     Mean Median    SD Total
##    <int> <chr>  <dbl>  <dbl> <dbl> <int>
## 1      1 female  36.8   35.5  14.6   114
## 2      1 male    40.8   41.5  14.7   122
## 3      2 female  28.0   28.0  13.3    12
## 4      2 male    19.7   20.0  14.7     9
## 5      3 female  14.5   14.0  13.5     6
## 6      3 male    24.9   25.0  11.1     7

#From this table, it looks like there's a high standard deviation in the ages
#within each group. This could give us a bad prediction for the missing ages if
#we decided to use the mean or median as a replacement. Let's look at some more
#details in our table to see if there are further clues.

# Passenger names include a title such as "Mr.", "Miss.", "Mrs." or
# "Master.", which can give a clue about age, so extract it into a new Title
# column. The pattern matches a run of letters followed by a literal dot.
combined <- combined %>%
  mutate(Title = str_extract(Name, "[a-zA-Z]+\\."))
# Age statistics per class, sex and title on complete rows, showing only the
# male groups.
combined %>%
  na.omit() %>%
  group_by(Pclass, Sex, Title) %>%
  summarise(
    Mean = mean(Age),
    Median = median(Age),
    SD = sd(Age),
    Total = n()
  ) %>%
  filter(Sex == "male")

## # A tibble: 11 x 7
## # Groups:   Pclass, Sex [3]
##    Pclass Sex   Title    Mean Median     SD Total
##     <int> <chr> <chr>   <dbl>  <dbl>  <dbl> <int>
##  1      1 male  Capt.   70.0   70.0  NaN        1
##  2      1 male  Col.    52.0   53.0    4.58     3
##  3      1 male  Dr.     43.0   44.0   10.5      3
##  4      1 male  Major.  48.5   48.5    4.95     2
##  5      1 male  Master.  6.98   6.00   4.97     5
##  6      1 male  Mr.     41.5   41.0   13.3    107
##  7      1 male  Sir.    49.0   49.0  NaN        1
##  8      2 male  Master.  2.00   2.00   1.00     3
##  9      2 male  Mr.     28.5   30.0    8.11     6
## 10      3 male  Master.  6.00   6.00 NaN        1
## 11      3 male  Mr.     28.0   25.0    8.00     6

# "Master" could be considered as children or young boys.
# Age statistics per class, sex and title on complete rows, showing only the
# female groups.
combined %>%
  na.omit() %>%
  group_by(Pclass, Sex, Title) %>%
  summarise(
    Mean = mean(Age),
    Median = median(Age),
    SD = sd(Age),
    Total = n()
  ) %>%
  filter(Sex == "female")

## # A tibble: 12 x 7
## # Groups:   Pclass, Sex [3]
##    Pclass Sex    Title      Mean Median     SD Total
##     <int> <chr>  <chr>     <dbl>  <dbl>  <dbl> <int>
##  1      1 female Countess. 33.0   33.0  NaN        1
##  2      1 female Dona.     39.0   39.0  NaN        1
##  3      1 female Dr.       49.0   49.0  NaN        1
##  4      1 female Lady.     48.0   48.0  NaN        1
##  5      1 female Miss.     29.9   30.0   12.5     47
##  6      1 female Mlle.     24.0   24.0    0        2
##  7      1 female Mme.      24.0   24.0  NaN        1
##  8      1 female Mrs.      42.4   44.5   14.4     60
##  9      2 female Miss.     19.9   24.0   11.6      5
## 10      2 female Mrs.      33.9   34.0   11.7      7
## 11      3 female Miss.      2.33   2.00   1.53     3
## 12      3 female Mrs.      26.7   27.0    2.52     3

#"Miss" title could be used to represent a more younger group of females. 
# Simplify the titles to four values: Mr, Master, Mrs and Miss.
# Males become "Master" when the title is "Master." and "Mr" otherwise;
# females become "Miss" when the title is "Miss." and "Mrs" otherwise.
combined <- combined %>%
  mutate(
    Title2 = ifelse(
      Sex == "male",
      ifelse(Title == "Master.", "Master", "Mr"),
      ifelse(Title == "Miss.", "Miss", "Mrs")
    )
  )
# Mean of the known ages for each simplified title.
missingAge <- combined %>%
  group_by(Title2) %>%
  summarize(meanAge = mean(Age, na.rm = TRUE))


# Join the per-title mean ages (missingAge) onto the combined data, use
# meanAge wherever Age is missing, then drop the helper column again.
combined <- combined %>%
  left_join(missingAge, by = "Title2") %>%
  mutate(Age = ifelse(is.na(Age), meanAge, Age)) %>%
  select(-meanAge)
# Tabulate the embarkation ports.
table(combined$Embarked)

## 
##   C   Q   S 
## 270 123 914

# Row positions where Embarked is missing.
combined$Embarked %>%
  is.na() %>%
  which()

## [1]  62 830

# Fill every missing Embarked value with "S", the most frequent port in the
# table above. Selecting by is.na() rather than by hard-coded row numbers
# keeps this correct if the row order or the input data changes.
combined$Embarked[is.na(combined$Embarked)] <- "S"
table(combined$Embarked)

## 
##   C   Q   S 
## 270 123 916

# Use the median Fare of all passengers as the replacement for the missing
# value. The previous version took the median of na.omit(combined), which
# also drops every row with a missing Cabin (most of the data), so the value
# it produced (57.0) came only from passengers with a recorded cabin.
# median(..., na.rm = TRUE) skips nothing but the missing Fare itself.
fare_median <- median(combined$Fare, na.rm = TRUE)
fare_median

which(is.na(combined$Fare))

## [1] 1044

# Impute by condition instead of by a hard-coded row number and constant.
combined$Fare[is.na(combined$Fare)] <- fare_median
colSums(is.na(combined))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0           0 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0        1014           0 
##       Title      Title2 
##           0           0

# Cabin is missing for 1014 of the 1309 entries, so remove the column from
# the combined data set.
combined <- combined %>%
  select(-Cabin)
# 4. Exploratory data analysis: look for patterns in the data
# Start with Sex and Age and how they relate to Survived: a histogram of Age
# for the first 891 rows, one panel per Sex, filled by Survived.
ggplot(combined[1:891, ], aes(x = Age, fill = factor(Survived))) +
  geom_histogram(bins = 30) +
  xlab("Age") +
  ylab("Count") +
  facet_grid(. ~ Sex) +
  scale_fill_discrete(name = "Survived") +
  ggtitle("Age vs Sex vs Survived")

#0 means the passenger died
#We can see here that a far larger share of females survived than males.
#Females may have been given priority to survive.
#Males aged 20-50 had a very high death rate.
#Males below the age of 20 had a considerable survival rate, probably because many of them were children.

# Pclass vs Survived: passenger counts per class for the first 891 rows,
# filled by Survived.
ggplot(combined[1:891, ], aes(x = Pclass, fill = factor(Survived))) +
  geom_bar() +
  xlab("PClass") +
  ylab("Count") +
  scale_fill_discrete(name = " Survived") +
  ggtitle("Pclass vs Survived")

#From this graph, we can speculate that generally passengers in 1st class had more than a 50%
#survival rate. 2nd Class had about a 50% survival rate.
#And 3rd Class had below a 50% survival rate. The wealthier the individual meant the higher survival rate.

# Fare vs Survived: histogram of Fare for the first 891 rows, filled by
# Survived (default binning).
ggplot(combined[1:891, ], aes(x = Fare, fill = factor(Survived))) +
  geom_histogram() +
  labs(x = "Fare", y = "Count", title = "Fare vs Survived")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Fare vs Survived
# a high correlation between wealth and survival rate. 
#paid less than about $50 on their Fare price had less than a 50% chance of survival. 
#As you move towards the right side of the graph, you tend to see a higher survival rate, 
# Embarked vs Survived: one bar per port for the first 891 rows, with
# position = "fill" so each bar shows proportions rather than counts.
ggplot(combined[1:891, ], aes(x = Embarked, fill = factor(Survived))) +
  geom_bar(position = "fill") +
  ylab("Frequency")

#IT INFER ONE WHO IS IN S PORT SURVIVED MORE
#which we can further support our statement of survival rate favoring the wealthy.

#females,children,wealthy having more survival rate
#sex,age,fare,pclass,Embarked,Title

# Turn the categorical predictors used for the prediction (Sex, Embarked,
# Title2 and Pclass) into factors.
combined <- combined %>%
  mutate(
    Sex = as.factor(Sex),
    Embarked = as.factor(Embarked),
    Title2 = as.factor(Title2),
    Pclass = as.factor(Pclass)
  )
# Split the combined data back into the training set (rows 1-891) and the
# test set (rows 892-1309).
train <- combined[seq_len(891), ]
test <- combined[892:1309, ]
#randomForest
library(randomForest)

## Warning: package 'randomForest' was built under R version 3.4.2

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

# Set the random seed before fitting the forest.
set.seed(1234)
# Fit a random forest that predicts Survived from Pclass, Sex, Fare,
# Embarked, Title2 and Age on the training set.
rf_model <- randomForest(
  factor(Survived) ~ Pclass + Sex + Fare + Embarked + Title2 + Age,
  data = train
)
# Plot the predictors in order of importance.
varImpPlot(rf_model, main = "RF_MODEL")

# Print the fitted model.
rf_model

## 
## Call:
##  randomForest(formula = factor(Survived) ~ Pclass + Sex + Fare +      Embarked + Title2 + Age, data = train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 16.05%
## Confusion matrix:
##     0   1 class.error
## 0 509  40  0.07285974
## 1 103 239  0.30116959

#The out-of-bag (OOB) estimate of the error rate turned out to be 16.05%, meaning
#that we had about 83.95% accuracy. Note that this estimate comes from the
#training data (the observations left out of each tree), not from the test data set.
#Also, the RF_MODEL plot showed us that Title, Fare, and Sex were the top three predictor
#variables for our prediction.
#Let's see how our model reflects the predictions on Kaggle's data set!
# Predict survival for the test set with the fitted random forest.
prediction <- predict(rf_model, newdata = test)
# Save the solution to a data frame with two columns: the passenger id and
# the predicted Survived value.
solution <- data.frame(PassengerID = test$PassengerId, Survived = prediction)

# Write the solution to file without row names. FALSE is spelled out because
# the shorthand F is an ordinary variable that can be reassigned.
write.csv(
  solution,
  file = "C:/Users/Vaibhav Goyal/Desktop/simpl/projectsdone/titanic/solution.csv",
  row.names = FALSE
)

updatetitanic.R

Vaibhav Goyal

Sun Feb 18 09:25:17 2018