get the data from here: https://drive.google.com/file/d/1O3-thqx_6cyWQ6z7VxIsPwxfDfWTtzXZ/view?usp=sharing
# Load the Titanic passenger records into the data frame `data1`.
# The path below is specific to one machine; point it at your local copy.
data1 <- read.csv(
  file = "C:\\Users\\Lenovo\\Desktop\\stats-assignment 2\\titanic.csv"
)
#Test to be used
#' Run two association tests on a table of counts.
#'
#' @param x Count data accepted by both `chisq.test()` and `fisher.test()`.
#' @return A data frame with one row per test: `Method` is the method name
#'   reported by the test and `P.Value` is its p-value (chi-squared first,
#'   Fisher second).
nptest <- function(x) {
  pearson <- chisq.test(x)
  exact <- fisher.test(x)
  method_names <- c(pearson$method, exact$method)
  p_values <- c(pearson$p.value, exact$p.value)
  data.frame(Method = method_names, P.Value = p_values)
}
#' Compare two samples with two nonparametric tests.
#'
#' @param x First sample, passed to `ks.test()` and `wilcox.test()`.
#' @param y Second sample, passed to the same two tests.
#' @return A data frame with one row per test: `Method` is the method name
#'   reported by the test and `P.Value` is its p-value (Kolmogorov-Smirnov
#'   first, Wilcoxon second).
sigtest <- function(x, y) {
  ks_result <- ks.test(x, y)
  rank_result <- wilcox.test(x, y)
  method_names <- c(ks_result$method, rank_result$method)
  p_values <- c(ks_result$p.value, rank_result$p.value)
  data.frame(Method = method_names, P.Value = p_values)
}
#' Run a battery of normality tests on one sample.
#'
#' Applies `shapiro.test()` together with four tests from the `nortest`
#' package: `ad.test()`, `cvm.test()`, `lillie.test()` and `sf.test()`.
#'
#' @param x Numeric vector of observations.
#' @return A data frame with five rows, one per test in the order listed
#'   above: `Method` is the method name reported by the test and `P.Value`
#'   is its p-value.
normtest <- function(x) {
  # Use nortest through its namespace instead of library(): attaching a
  # package from inside a function silently alters the caller's search path.
  if (!requireNamespace("nortest", quietly = TRUE)) {
    stop("Package 'nortest' is required by normtest(); please install it.",
         call. = FALSE)
  }
  s <- shapiro.test(x)
  ad <- nortest::ad.test(x)
  cvm <- nortest::cvm.test(x)
  ll <- nortest::lillie.test(x)
  sf <- nortest::sf.test(x)
  data.frame(
    Method = c(s$method, ad$method, cvm$method, ll$method, sf$method),
    P.Value = c(s$p.value, ad$p.value, cvm$p.value, ll$p.value, sf$p.value)
  )
}
# Pair every passenger's age with the survival flag
asu <- data.frame(data1.Age = data1$Age, data1.Survived = data1$Survived)
# Partition the rows by outcome: 0 = did not survive, 1 = survived
asu0 <- asu[which(asu$data1.Survived == 0), , drop = FALSE]
asu1 <- asu[which(asu$data1.Survived == 1), , drop = FALSE]
# Side-by-side box plots of age for the two outcome groups
boxplot(
  asu0$data1.Age,
  asu1$data1.Age,
  main = "Not - Survived Survived"
)
# Age histograms, one per outcome group
hist(asu0$data1.Age, xlab = "Age", main = "histogram of Age - Not Survived")
hist(asu1$data1.Age, xlab = "Age", main = "histogram of Age - Survived")
# Normal Q-Q plots, one per outcome group
qqnorm(asu0$data1.Age, main = "QQ Normal plot for Age - Not Survived")
qqnorm(asu1$data1.Age, main = "QQ Normal plot for Age - Survived")
# Normality tests on the ages of passengers who did not survive
normtest(x = asu0$data1.Age)
## Warning: package 'nortest' was built under R version 3.4.1
## Warning in cvm.test(x): p-value is smaller than 7.37e-10, cannot be
## computed more accurately
## Method P.Value
## 1 Shapiro-Wilk normality test 1.461444e-09
## 2 Anderson-Darling normality test 5.014279e-16
## 3 Cramer-von Mises normality test 7.370000e-10
## 4 Lilliefors (Kolmogorov-Smirnov) normality test 2.646866e-16
## 5 Shapiro-Francia normality test 1.719147e-08
# Normality tests on the ages of passengers who survived
normtest(x = asu1$data1.Age)
## Method P.Value
## 1 Shapiro-Wilk normality test 0.0004996137
## 2 Anderson-Darling normality test 0.0017831268
## 3 Cramer-von Mises normality test 0.0068092810
## 4 Lilliefors (Kolmogorov-Smirnov) normality test 0.0322461921
## 5 Shapiro-Francia normality test 0.0020591830
The p-values of all the normality tests are less than 0.05, which is strong evidence against the null hypothesis (that the data follow a normal distribution).
Neither the ages of the passengers who survived nor the ages of those who did not survive follow a normal distribution.
Null Hypothesis: There is no difference in Age distribution between those who survived and those who did not.
Alternate Hypothesis: There is a difference in Age Distribution between those who survived and those who did not
# Wilcoxon rank-sum test comparing age between non-survivors and survivors
wilcox.test(x = asu0$data1.Age, y = asu1$data1.Age)
##
## Wilcoxon rank sum test with continuity correction
##
## data: asu0$data1.Age and asu1$data1.Age
## W = 73190, p-value = 0.1917
## alternative hypothesis: true location shift is not equal to 0
Results
The P-Value of the Wilcoxon rank sum test is 0.192. There is no strong evidence against the Null hypothesis.
There is no significant difference between the ages of the people who survived and those who did not.
# Add gender to the age/survival table so age can be compared within each sex
gsu <- data.frame(
  data1.Gender = data1$Gender,
  data1.Age = data1$Age,
  data1.Survived = data1$Survived
)
# Partition the rows by outcome: 0 = did not survive, 1 = survived
gsu0 <- gsu[which(gsu$data1.Survived == 0), , drop = FALSE]
gsu1 <- gsu[which(gsu$data1.Survived == 1), , drop = FALSE]
# Gender head counts within each outcome group
barplot(
  table(gsu0$data1.Gender),
  main = "Barplot for Not Survived people",
  ylab = "Number of people"
)
barplot(
  table(gsu1$data1.Gender),
  main = "Barplot for Survived People",
  ylab = "Number of People"
)
# Partition each outcome group by gender
msu0 <- gsu0[which(gsu0$data1.Gender == "male"), , drop = FALSE]
fsu0 <- gsu0[which(gsu0$data1.Gender == "female"), , drop = FALSE]
msu1 <- gsu1[which(gsu1$data1.Gender == "male"), , drop = FALSE]
fsu1 <- gsu1[which(gsu1$data1.Gender == "female"), , drop = FALSE]
# Age box plots for the four gender-by-outcome groups
boxplot(
  msu0$data1.Age,
  msu1$data1.Age,
  fsu0$data1.Age,
  fsu1$data1.Age,
  main = "Male_not_survived Male_Survived Female_Not_Survived Female_Survived",
  cex.main = 0.9
)
# Age histograms for the four groups
hist(msu0$data1.Age, xlab = "Age", main = "histogram of Age - Not Survived - Male")
hist(msu1$data1.Age, xlab = "Age", main = "histogram of Age - Survived - Male")
hist(fsu0$data1.Age, xlab = "Age", main = "histogram of Age - Not Survived - Female")
hist(fsu1$data1.Age, xlab = "Age", main = "histogram of Age - Survived - Female")
# Normal Q-Q plots for the four groups
qqnorm(msu0$data1.Age, main = "QQ Normal plot for Age - Not Survived - Male")
qqnorm(msu1$data1.Age, main = "QQ Normal plot for Age - Survived - Male")
qqnorm(fsu0$data1.Age, main = "QQ Normal plot for Age - Not Survived - Female")
qqnorm(fsu1$data1.Age, main = "QQ Normal plot for Age - Survived - Female")
Normality tests for each of the four samples
# Males who did not survive
normtest(x = msu0$data1.Age)
## Warning in cvm.test(x): p-value is smaller than 7.37e-10, cannot be
## computed more accurately
## Method P.Value
## 1 Shapiro-Wilk normality test 6.368376e-10
## 2 Anderson-Darling normality test 2.227363e-16
## 3 Cramer-von Mises normality test 7.370000e-10
## 4 Lilliefors (Kolmogorov-Smirnov) normality test 6.088379e-15
## 5 Shapiro-Francia normality test 8.325134e-09
# Males who survived
normtest(x = msu1$data1.Age)
## Method P.Value
## 1 Shapiro-Wilk normality test 0.004201276
## 2 Anderson-Darling normality test 0.008390771
## 3 Cramer-von Mises normality test 0.045051620
## 4 Lilliefors (Kolmogorov-Smirnov) normality test 0.059854415
## 5 Shapiro-Francia normality test 0.013508441
# Females who did not survive
normtest(x = fsu0$data1.Age)
## Method P.Value
## 1 Shapiro-Wilk normality test 0.11296551
## 2 Anderson-Darling normality test 0.11530637
## 3 Cramer-von Mises normality test 0.08483381
## 4 Lilliefors (Kolmogorov-Smirnov) normality test 0.12109238
## 5 Shapiro-Francia normality test 0.11744076
# Females who survived
normtest(x = fsu1$data1.Age)
## Method P.Value
## 1 Shapiro-Wilk normality test 0.0026901879
## 2 Anderson-Darling normality test 0.0006357403
## 3 Cramer-von Mises normality test 0.0008234442
## 4 Lilliefors (Kolmogorov-Smirnov) normality test 0.0001661670
## 5 Shapiro-Francia normality test 0.0077707718
For female passengers who did not survive, none of the tests rejects normality (all p-values are above 0.05), so their ages are consistent with a normal distribution.
The other categories (male survived, male not survived, female survived) are not normally distributed.
Null Hypothesis: There is no significant difference in Age distribution between those who survived and those who did not for males.
Alternative Hypothesis: There is a significant difference in Age distribution between those who survived and those who did not for males.
# Compare male age distributions: non-survivors vs. survivors
sigtest(x = msu0$data1.Age, y = msu1$data1.Age)
## Warning in ks.test(x, y): p-value will be approximate in the presence of
## ties
## Method P.Value
## 1 Two-sample Kolmogorov-Smirnov test 0.002006277
## 2 Wilcoxon rank sum test with continuity correction 0.003961685
Results
The P-Values of both the two-sample KS test and the Wilcoxon rank sum test were found to be less than 0.05.
There is strong evidence against the Null Hypothesis.
There is a significant difference in Age distribution between those who survived and those who did not for males.
Null Hypothesis: There is no significant difference in Age distribution between those who survived and those who did not for females.
Alternative Hypothesis: There is a significant difference in Age distribution between those who survived and those who did not for females.
# Compare female age distributions: non-survivors vs. survivors
sigtest(x = fsu0$data1.Age, y = fsu1$data1.Age)
## Warning in ks.test(x, y): p-value will be approximate in the presence of
## ties
## Method P.Value
## 1 Two-sample Kolmogorov-Smirnov test 0.013259850
## 2 Wilcoxon rank sum test with continuity correction 0.005118923
Results
The P-Values of both the two-sample KS test and the Wilcoxon rank sum test were found to be less than 0.05.
There is strong evidence against the Null Hypothesis.
There is a significant difference in Age distribution between those who survived and those who did not for females.
# Bucket the ages of each outcome group into four bands
asuc0 <- cut(asu0$data1.Age, breaks = c(0, 18, 30, 55, 150))
asuc1 <- cut(asu1$data1.Age, breaks = c(0, 18, 30, 55, 150))
# Head count per band as (band, Freq) data frames: x = survived, y = not survived
x <- as.data.frame(table(asuc1))
y <- as.data.frame(table(asuc0))
# Put both count columns next to the band labels and give everything names
AgeTable <- cbind(x, y$Freq)
names(AgeTable) <- c("def", "Survived", "Un-Survived")
rownames(AgeTable) <- c("Young", "Young Adult", "Adult", "Old")
AgeTable
## def Survived Un-Survived
## Young (0,18] 69 57
## Young Adult (18,30] 104 202
## Adult (30,55] 124 156
## Old (55,150] 16 28
# Bar plot and box plot of the per-band counts of survivors
temp <- setNames(AgeTable$Survived, c("young", "young adult", "Adult", "old"))
barplot(temp, main = "Survived")
boxplot(temp, main = "Boxplot for Survived")
# Same two plots for the passengers who did not survive
temp <- setNames(AgeTable$`Un-Survived`, c("young", "young adult", "Adult", "old"))
barplot(temp, main = "Un-Survived")
boxplot(temp, main = "Boxplot for Not-Survived")
Null Hypothesis: The survival probability is not affected by the age of the passenger; it is homogeneous across all the age buckets.
Alternate Hypothesis: The survival probability is affected by the age of the passenger; it is not homogeneous across all the age buckets.
# Drop the leading band-label column so only the two count columns remain,
# then test whether the survived / not-survived split is the same in every band
AgeTable <- AgeTable[-1]
chisq.test(x = AgeTable)
##
## Pearson's Chi-squared test
##
## data: AgeTable
## X-squared = 17.625, df = 3, p-value = 0.0005255
Results
The p-value of the chi-square test of homogeneity is 0.0005255, which is less than 0.05.
There is a strong evidence against the null hypothesis
The Survival Probability is affected by the Age Distribution.
# Gender-by-survival contingency table for all passengers.
gen <- data.frame(data1$Gender, data1$Survived)
# Count each gender within each outcome group. Explicit levels keep the row
# order fixed (female, male) and give a zero count if a gender is absent.
gen0 <- table(factor(gen$data1.Gender[which(gen$data1.Survived == 0)],
                     levels = c("female", "male")))
gen1 <- table(factor(gen$data1.Gender[which(gen$data1.Survived == 1)],
                     levels = c("female", "male")))
# Combining Table
# gen0 holds the Survived == 0 counts, so its column is "Not-Survived";
# gen1 holds the Survived == 1 counts, so its column is "Survived".
GenderTable <- data.frame(
  `Not-Survived` = as.vector(gen0),
  Survived = as.vector(gen1),
  row.names = c("Female", "Male"),
  check.names = FALSE
)
GenderTable
## Not-Survived Survived
## Female 154 308
## Male 709 142
# Bar plots of the per-gender counts for each outcome
temp <- GenderTable$Survived
names(temp) <- c("Female", "Male")
barplot(temp, main = "Survived")
temp <- GenderTable$`Not-Survived`
names(temp) <- c("Female", "Male")
barplot(temp, main = "Not-Survived")
Null Hypothesis: There is no difference in the survival probability between the two genders
Alternative Hypothesis: There is a difference in the survival probability between the two genders
# Chi-squared and Fisher tests on the gender-by-survival counts
nptest(x = GenderTable)
## Method
## 1 Pearson's Chi-squared test with Yates' continuity correction
## 2 Fisher's Exact Test for Count Data
## P.Value
## 1 1.040403e-73
## 2 4.826448e-74
Results
Both the Chi-Sq test and Fisher test are giving P-Value less than 0.05.
There is a strong evidence against Null Hypothesis
The Survival Rate is dependent on the Gender based on chi square test for independence and Fisher’s Exact
# Survival counts by passenger class, built from a single cross-tabulation.
# Explicit factor levels fix the row order (1st, 2nd, 3rd) and the column
# order (Survived == 1, then Survived == 0) whether read.csv() returned PClass
# as a factor or as character. Indexing per-class tables by position only
# works for a factor that carries all three levels; with a character column
# (the read.csv() default since R 4.0) it produces NA counts.
class_counts <- table(
  factor(data1$PClass, levels = c("1st", "2nd", "3rd")),
  factor(data1$Survived, levels = c(1, 0))
)
# as.vector() walks the table column by column, which is also how matrix()
# fills by default, so the 3 x 2 layout is preserved.
ClassTable <- matrix(
  as.vector(class_counts),
  ncol = 2,
  dimnames = list(
    c("Class-1", "Class-2", "Class-3"),
    c("Survived", "Un-Survived")
  )
)
ClassTable
## Survived Un-Survived
## Class-1 193 129
## Class-2 119 161
## Class-3 138 573
Null Hypothesis: The survival probability is the same across all the passenger classes
Alternate Hypothesis: The survival probability is not the same across all the passenger classes
# Chi-squared and Fisher tests on the class-by-survival counts
nptest(x = ClassTable)
## Method P.Value
## 1 Pearson's Chi-squared test 3.852316e-38
## 2 Fisher's Exact Test for Count Data 2.791493e-38
Results The P-Value given by both Chi-Sq test and Fisher’s Exact test are less than 0.05.
There is a strong evidence against Null hypothesis.
The survival probability is not distributed evenly across the P-Class.
#The Gender Table Generator for Different classes
#' Build a gender-by-survival count table for a set of passengers.
#'
#' @param x Data frame with a `Gender` column (values "female" / "male") and
#'   a `Survived` column (0 = did not survive, 1 = survived).
#' @return A data frame with rows "Female" and "Male". The first column,
#'   "Not-Survived", counts passengers with `Survived == 0`; the second,
#'   "Survived", counts passengers with `Survived == 1`.
gtrans <- function(x) {
  # Work on the argument itself, not on a global data frame, so the result
  # always describes the passengers the caller passed in.
  # Explicit levels keep both rows (as zero counts) when a gender is absent.
  gender <- factor(x$Gender, levels = c("female", "male"))
  died <- table(gender[which(x$Survived == 0)])
  lived <- table(gender[which(x$Survived == 1)])
  # Each column is labelled after the outcome it actually counts.
  data.frame(
    `Not-Survived` = as.vector(died),
    Survived = as.vector(lived),
    row.names = c("Female", "Male"),
    check.names = FALSE
  )
}
# First-class passengers only, tabulated by gender and survival
q6 <- data1[which(data1$PClass == "1st"), , drop = FALSE]
temp <- gtrans(x = q6)
temp
## Survived Not-Survived
## Female 9 134
## Male 120 59
Null Hypothesis: There is no difference in the survival proportion between the male and female passengers of class 1
Alternate Hypothesis: There is a significant difference in the survival proportion between the male and female passengers of class 1
# Chi-squared and Fisher tests on the first-class gender-by-survival counts
nptest(x = temp)
## Method
## 1 Pearson's Chi-squared test with Yates' continuity correction
## 2 Fisher's Exact Test for Count Data
## P.Value
## 1 7.576739e-28
## 2 1.086515e-31
Results
The P-Value of both the Chi-Sq test and Fisher Exact Test is less than 0.05
There is a strong evidence against Null Hypothesis
There is a significant difference between the Survival Probability of Male and Female class-1 Passengers.
# Second-class passengers only, tabulated by gender and survival
q6 <- data1[which(data1$PClass == "2nd"), , drop = FALSE]
temp <- gtrans(x = q6)
temp
## Survived Not-Survived
## Female 13 94
## Male 148 25
Null Hypothesis: There is no difference in the survival proportion between the male and female passengers of class 2
Alternate Hypothesis: There is a significant difference in the survival proportion between the male and female passengers of class 2
# Chi-squared and Fisher tests on the second-class gender-by-survival counts
nptest(x = temp)
## Method
## 1 Pearson's Chi-squared test with Yates' continuity correction
## 2 Fisher's Exact Test for Count Data
## P.Value
## 1 6.633397e-33
## 2 5.079791e-36
Results
The P-Value of both the Chi-Sq test and Fisher Exact Test is less than 0.05
There is a strong evidence against Null Hypothesis
There is a significant difference between the Survival Probability of Male and Female class-2 Passengers.
# Third-class passengers only, tabulated by gender and survival
q6 <- data1[which(data1$PClass == "3rd"), , drop = FALSE]
temp <- gtrans(x = q6)
temp
## Survived Not-Survived
## Female 132 80
## Male 441 58
Null Hypothesis: There is no difference in the survival proportion between the male and female passengers of class 3
Alternate Hypothesis: There is a significant difference in the survival proportion between the male and female passengers of class 3
# Chi-squared test on the third-class gender-by-survival counts
chisq.test(x = temp)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: temp
## X-squared = 63.201, df = 1, p-value = 1.867e-15
Results
The P-Value of the Chi-Sq test is less than 0.05
There is a strong evidence against Null Hypothesis
There is a significant difference between the Survival Probability of Male and Female class-3 Passengers.