library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.3
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.2 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## Warning: package 'ggplot2' was built under R version 3.4.2
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'tidyr' was built under R version 3.4.3
## Warning: package 'readr' was built under R version 3.4.3
## Warning: package 'purrr' was built under R version 3.4.3
## Warning: package 'dplyr' was built under R version 3.4.2
## Warning: package 'stringr' was built under R version 3.4.2
## Warning: package 'forcats' was built under R version 3.4.3
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# 1. Load the data
# Read the training set from disk into `train`; read_csv() prints the column
# specification it guessed (recorded below).
train_path <- "C:/Users/Vaibhav Goyal/Desktop/simpl/projectsdone/titanic/train.csv"
train <- read_csv(train_path)
## Parsed with column specification:
## cols(
## PassengerId = col_integer(),
## Survived = col_integer(),
## Pclass = col_integer(),
## Name = col_character(),
## Sex = col_character(),
## Age = col_double(),
## SibSp = col_integer(),
## Parch = col_integer(),
## Ticket = col_character(),
## Fare = col_double(),
## Cabin = col_character(),
## Embarked = col_character()
## )
# Read the test set from disk into `test`; per the column specification
# recorded below it has the same columns as the training set minus Survived.
test_path <- "C:/Users/Vaibhav Goyal/Desktop/simpl/projectsdone/titanic/test.csv"
test <- read_csv(test_path)
## Parsed with column specification:
## cols(
## PassengerId = col_integer(),
## Pclass = col_integer(),
## Name = col_character(),
## Sex = col_character(),
## Age = col_double(),
## SibSp = col_integer(),
## Parch = col_integer(),
## Ticket = col_character(),
## Fare = col_double(),
## Cabin = col_character(),
## Embarked = col_character()
## )
# Count the missing values in every column of the training set.
train %>% is.na() %>% colSums()
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 687 2
# The training set has missing values in Age, Cabin, and Embarked.
# Same check for the test set.
test %>% is.na() %>% colSums()
## PassengerId Pclass Name Sex Age SibSp
## 0 0 0 0 86 0
## Parch Ticket Fare Cabin Embarked
## 0 0 1 327 0
# 2. Check how clean the data is
# The test set has missing values in Age, Cabin, and Fare, and it has no
# Survived column. To clean both tables in one pass we stack them into a
# single data set. rbind() needs matching columns, so first build a copy of
# the test set with a placeholder Survived column ("None" for every row).
# NOTE(review): mixing the training labels with the "None" placeholder means
# Survived is no longer a plain integer column in `combined` - confirm that
# downstream code only uses Survived from the training rows.
test.survived <- data.frame(
  Survived = rep("None", times = nrow(test)),
  test
)
# Stack the training rows on top of the padded test rows.
combined <- rbind(train, test.survived)
# 3. Fill in the missing values
# Summarise Age by passenger class and sex to judge whether a plain mean or
# median would be a reasonable replacement for the missing ages.
# Only rows with a missing Age are dropped here. na.omit(combined) would drop
# every row with an NA in ANY column, and Cabin is missing for most passengers,
# so the statistics would come from a small complete-case subset.
# NOTE: the output recorded below predates this change (it was produced from
# complete cases only, 270 rows in total); re-run to refresh it.
combined %>%
  filter(!is.na(Age)) %>%
  group_by(Pclass, Sex) %>%
  summarise(Mean = mean(Age), Median = median(Age), SD = sd(Age), Total = n())
## Warning: package 'bindrcpp' was built under R version 3.4.2
## # A tibble: 6 x 6
## # Groups: Pclass [?]
## Pclass Sex Mean Median SD Total
## <int> <chr> <dbl> <dbl> <dbl> <int>
## 1 1 female 36.8 35.5 14.6 114
## 2 1 male 40.8 41.5 14.7 122
## 3 2 female 28.0 28.0 13.3 12
## 4 2 male 19.7 20.0 14.7 9
## 5 3 female 14.5 14.0 13.5 6
## 6 3 male 24.9 25.0 11.1 7
#From this table, it looks like there's a high standard deviation in the ages.
#This could give us a bad prediction for the ages if we decided to use the mean
#or median as a replacement. Let's look at some more details in our table to see if there
#are further clues.
# Passenger names contain a title (Mr., Miss., Mrs., Master., ...) that hints
# at age, so pull it out into a new Title column. The pattern matches the
# first run of letters that is immediately followed by a full stop.
combined <- combined %>%
  mutate(Title = str_extract(Name, "[a-zA-Z]+\\."))
# Age statistics per class and title for male passengers.
# Only rows with a missing Age are dropped; na.omit(combined) would also drop
# every passenger with any other missing field (notably Cabin) and leave a
# small complete-case subset.
# NOTE: the output recorded below predates this change (it was produced from
# complete cases only); re-run to refresh it.
combined %>%
  filter(!is.na(Age), Sex == "male") %>%
  group_by(Pclass, Sex, Title) %>%
  summarise(Mean = mean(Age), Median = median(Age), SD = sd(Age), Total = n())
## # A tibble: 11 x 7
## # Groups: Pclass, Sex [3]
## Pclass Sex Title Mean Median SD Total
## <int> <chr> <chr> <dbl> <dbl> <dbl> <int>
## 1 1 male Capt. 70.0 70.0 NaN 1
## 2 1 male Col. 52.0 53.0 4.58 3
## 3 1 male Dr. 43.0 44.0 10.5 3
## 4 1 male Major. 48.5 48.5 4.95 2
## 5 1 male Master. 6.98 6.00 4.97 5
## 6 1 male Mr. 41.5 41.0 13.3 107
## 7 1 male Sir. 49.0 49.0 NaN 1
## 8 2 male Master. 2.00 2.00 1.00 3
## 9 2 male Mr. 28.5 30.0 8.11 6
## 10 3 male Master. 6.00 6.00 NaN 1
## 11 3 male Mr. 28.0 25.0 8.00 6
# "Master" could be considered as children or young boys.
# Age statistics per class and title for female passengers.
# Only rows with a missing Age are dropped; na.omit(combined) would also drop
# every passenger with any other missing field (notably Cabin) and leave a
# small complete-case subset.
# NOTE: the output recorded below predates this change (it was produced from
# complete cases only); re-run to refresh it.
combined %>%
  filter(!is.na(Age), Sex == "female") %>%
  group_by(Pclass, Sex, Title) %>%
  summarise(Mean = mean(Age), Median = median(Age), SD = sd(Age), Total = n())
## # A tibble: 12 x 7
## # Groups: Pclass, Sex [3]
## Pclass Sex Title Mean Median SD Total
## <int> <chr> <chr> <dbl> <dbl> <dbl> <int>
## 1 1 female Countess. 33.0 33.0 NaN 1
## 2 1 female Dona. 39.0 39.0 NaN 1
## 3 1 female Dr. 49.0 49.0 NaN 1
## 4 1 female Lady. 48.0 48.0 NaN 1
## 5 1 female Miss. 29.9 30.0 12.5 47
## 6 1 female Mlle. 24.0 24.0 0 2
## 7 1 female Mme. 24.0 24.0 NaN 1
## 8 1 female Mrs. 42.4 44.5 14.4 60
## 9 2 female Miss. 19.9 24.0 11.6 5
## 10 2 female Mrs. 33.9 34.0 11.7 7
## 11 3 female Miss. 2.33 2.00 1.53 3
## 12 3 female Mrs. 26.7 27.0 2.52 3
#The "Miss" title could be used to represent a younger group of females.
#We can simplify the title values to four types: Mr, Master, Mrs, and Miss.
# Collapse the titles into four groups in a new Title2 column:
#   males   -> "Master" when the title is "Master.", otherwise "Mr"
#   females -> "Miss" when the title is "Miss.", otherwise "Mrs"
combined <- combined %>%
  mutate(
    Title2 = ifelse(
      Sex == "male",
      ifelse(Title == "Master.", "Master", "Mr"),
      ifelse(Title == "Miss.", "Miss", "Mrs")
    )
  )
# Mean of the known ages within each simplified title; used further down to
# fill in the missing ages.
missingAge <- summarize(
  group_by(combined, Title2),
  meanAge = mean(Age, na.rm = TRUE)
)
# Join the per-title mean ages onto `combined`, use meanAge wherever Age is
# missing, and then drop the helper column again.
combined <- combined %>%
  left_join(missingAge, by = "Title2") %>%
  mutate(Age = ifelse(is.na(Age), meanAge, Age)) %>%
  select(-meanAge)
# Embarked: look at the port counts, then fill the missing entries with "S",
# the most common port in the table below.
table(combined$Embarked)
##
## C Q S
## 270 123 914
# Locate the missing entries with a logical mask instead of hard-coded row
# numbers, so the imputation still hits the right rows if the input data or
# the row order changes.
embarked_missing <- is.na(combined$Embarked)
which(embarked_missing)
## [1] 62 830
combined$Embarked[embarked_missing] <- "S"
table(combined$Embarked)
##
## C Q S
## 270 123 916
# Fare: replace the missing value with the median of all known fares.
# The median is taken over every non-missing Fare. The earlier version used
# na.omit(combined), which at this point keeps only the rows that have a
# Cabin recorded, and produced a median of 57.0 from that subset; that value
# was then copied in by hand. Computing and assigning the median directly
# removes both the sampling bias and the hard-coded constant / row index.
fare_median <- median(combined$Fare, na.rm = TRUE)
fare_median
fare_missing <- is.na(combined$Fare)
which(fare_missing)
## [1] 1044
combined$Fare[fare_missing] <- fare_median
# Re-check the missing-value counts after the imputations above.
combined %>% is.na() %>% colSums()
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 0
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 1014 0
## Title Title2
## 0 0
# Cabin is missing for 1014 of the 1309 entries, so remove the column from
# the combined data set instead of trying to fill it.
combined <- combined %>% select(-Cabin)
# 4. Exploratory data analysis: look for patterns
# Histogram of Age, one panel per Sex, filled by Survived. Only the first 891
# rows of `combined` are plotted.
ggplot(combined[1:891, ], aes(x = Age, fill = factor(Survived))) +
  geom_histogram(bins = 30) +
  facet_grid(. ~ Sex) +
  scale_fill_discrete(name = "Survived") +
  labs(x = "Age", y = "Count", title = "Age vs Sex vs Survived")

#0 means the passenger died; 1 means the passenger survived.
#We can see here that a far greater share of females survived than males;
#perhaps females were given priority in the rescue.
#Males aged 20-50 had a very high death rate.
#Males below the age of 20 had a considerable survival rate, probably because they were children.
# Pclass vs Survived: bar chart of passenger counts per class, filled by
# Survived, for the first 891 rows of `combined`.
ggplot(combined[1:891, ], aes(x = Pclass, fill = factor(Survived))) +
  geom_bar() +
  scale_fill_discrete(name = " Survived") +
  labs(x = "PClass", y = "Count", title = "Pclass vs Survived")

#From this graph, we can speculate that passengers in 1st class generally had more than a 50%
#survival rate, 2nd class had about a 50% survival rate,
#and 3rd class had below a 50% survival rate: the wealthier the passenger, the higher the survival rate.
# Fare vs Survived: histogram of Fare filled by Survived for the first 891
# rows of `combined`. No bin count is given, so ggplot2 uses its default and
# prints the stat_bin() message recorded below.
ggplot(combined[1:891, ], aes(x = Fare, fill = factor(Survived))) +
  geom_histogram() +
  labs(x = "Fare", y = "Count", title = "Fare vs Survived")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Fare vs Survived (notes on the histogram above):
# there is a high correlation between wealth and survival rate.
# Passengers who paid less than about $50 for their fare had less than a 50%
# chance of survival; moving towards the right side of the graph, the
# survival rate tends to be higher.
# Embarked vs Survived: bars are scaled to the same height, so each bar shows
# the share of each Survived value per port.
ggplot(combined[1:891, ], aes(x = Embarked, fill = factor(Survived))) +
  geom_bar(position = "fill") +
  labs(y = "Frequency")

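#The bars show the proportion of survivors for each port of embarkation (C, Q, S).
#NOTE(review): the original note said that passengers from port S survived more; check this against
#the plot, since a filled bar chart shows proportions within each port, not counts.
#Differences between ports can further support our statement that survival favoured the wealthy.
#Overall: females, children, and the wealthy had higher survival rates.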
# Predictors we want to use: Sex, Age, Fare, Pclass, Embarked, Title.
# Convert the categorical ones (Sex, Embarked, Title2, Pclass) to factors on
# the combined data so both subsets share the same levels.
combined <- combined %>%
  mutate(
    Sex = as.factor(Sex),
    Embarked = as.factor(Embarked),
    Title2 = as.factor(Title2),
    Pclass = as.factor(Pclass)
  )
# Split the combined data back into the training rows (1-891) and the test
# rows (892-1309).
train <- combined[seq_len(891), ]
test <- combined[892:1309, ]
#randomForest
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.2
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Fix the random seed so the forest is reproducible.
set.seed(1234)
# Fit a random forest that predicts Survival from Pclass, Sex, Fare, Embarked,
# Title2, and Age; Survived is wrapped in factor() in the formula.
rf_model <- randomForest(
  factor(Survived) ~ Pclass + Sex + Fare + Embarked + Title2 + Age,
  data = train
)
# Plot the predictors in order of importance.
varImpPlot(rf_model, main = "RF_MODEL")

# Print the model summary (OOB error rate and confusion matrix).
print(rf_model)
##
## Call:
## randomForest(formula = factor(Survived) ~ Pclass + Sex + Fare + Embarked + Title2 + Age, data = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 16.05%
## Confusion matrix:
## 0 1 class.error
## 0 509 40 0.07285974
## 1 103 239 0.30116959
#The OOB estimate of the error rate turned out to be 16.05%, meaning about
#83.95% accuracy. Note that this is the out-of-bag estimate computed on the training data,
#not a score on the test set (which has no Survived labels).
#Also, the RF_MODEL plot showed us that Title, Fare, and Sex were the top three predictor
#variables for our prediction.
# Let's see how our model performs on Kaggle's data set.
# Predict survival for the test set with the fitted forest.
prediction <- predict(rf_model, newdata = test)
# Save the solution to a data frame with two columns: PassengerId and
# Survived (prediction). The id column is named exactly like the input
# column (PassengerId, not PassengerID) so the submission header matches the
# data it came from.
solution <- data.frame(PassengerId = test$PassengerId, Survived = prediction)
# Write the solution to file without row names. FALSE is spelled out because
# `F` is an ordinary variable that can be reassigned.
write.csv(solution, file = "C:/Users/Vaibhav Goyal/Desktop/simpl/projectsdone/titanic/solution.csv", row.names = FALSE)