##Use the categorical variables Pclass and Embarked in your analysis.
##Upload your zipped Rmd + HTML file or just your Rmd along with a link from RPubs.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the Titanic training data and print its structure.
# The CSV is read through an explicit path instead of setwd(), so the
# session's working directory is left unchanged.
data_dir <- "C:/Users/StarKid/Desktop/Data_Science/Data_101/week_5/IC10"
titanic <- read.csv(file.path(data_dir, "titanic_train.csv"))
str(titanic)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
# Recode passenger class as a labelled factor (1st / 2nd / 3rd).
titanic[["Pclass"]] <- factor(
  titanic[["Pclass"]],
  levels = c("1", "2", "3"),
  labels = c("1st", "2nd", "3rd")
)
# Recode the embarkation codes as a labelled factor. Values that are not one
# of the listed codes become NA.
titanic[["Embarked"]] <- factor(
  titanic[["Embarked"]],
  levels = c("C", "Q", "S"),
  labels = c("Cherbourg", "Queenstown", "Southampton")
)
# Mean fare for each passenger class (TRUE spelled out; T can be reassigned).
titanic %>%
  group_by(Pclass) %>%
  summarise(Avg_Price = mean(Fare, na.rm = TRUE))
## # A tibble: 3 × 2
## Pclass Avg_Price
## <fct> <dbl>
## 1 1st 84.2
## 2 2nd 20.7
## 3 3rd 13.7
# Compare fares across passenger classes: a boxplot with outliers hidden,
# followed by a one-way ANOVA of fare on class.
boxplot(titanic$Fare ~ titanic$Pclass, outline = FALSE)
# The formula is written with titanic$ prefixes; the fitted call is echoed
# verbatim in the printed ANOVA / Tukey output.
results <- aov(titanic$Fare ~ titanic$Pclass)
summary(results)
## Df Sum Sq Mean Sq F value Pr(>F)
## titanic$Pclass 2 776030 388015 242.3 <2e-16 ***
## Residuals 888 1421769 1601
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey HSD post-hoc comparison of each pair of class means from the ANOVA
# fit, at the default 95% family-wise confidence level.
TukeyHSD(x = results, conf.level = 0.95)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = titanic$Fare ~ titanic$Pclass)
##
## $`titanic$Pclass`
## diff lwr upr p adj
## 2nd-1st -63.492504 -72.91649 -54.068521 0.0000000
## 3rd-1st -70.479137 -78.14891 -62.809367 0.0000000
## 3rd-2nd -6.986633 -15.10638 1.133114 0.1079834
# Mean fare for each embarkation port (TRUE spelled out; T can be reassigned).
titanic %>%
  group_by(Embarked) %>%
  summarise(Avg_Price = mean(Fare, na.rm = TRUE))
## # A tibble: 4 × 2
## Embarked Avg_Price
## <fct> <dbl>
## 1 Cherbourg 60.0
## 2 Queenstown 13.3
## 3 Southampton 27.1
## 4 <NA> 80
# Mean fare per embarkation port. tapply() leaves out rows whose port is NA,
# and na.rm = TRUE matches the summarise() calls on Fare so a missing fare
# cannot turn a port's mean into NA.
means <- tapply(titanic$Fare, titanic$Embarked, mean, na.rm = TRUE)
boxplot(titanic$Fare ~ titanic$Embarked, outline = FALSE)
# Overlay each port's mean as a solid dot at that port's box position.
points(seq_along(means), means, col = "black", pch = 19)
# Passenger class by embarkation port counts, with row and column totals.
table(titanic$Pclass, titanic$Embarked) %>%
  addmargins()
##
## Cherbourg Queenstown Southampton Sum
## 1st 85 2 127 214
## 2nd 17 3 164 184
## 3rd 66 72 353 491
## Sum 168 77 644 889
# Passenger class by sex counts, with row and column totals.
table(titanic$Pclass, titanic$Sex) %>%
  addmargins()
##
## female male Sum
## 1st 94 122 216
## 2nd 76 108 184
## 3rd 144 347 491
## Sum 314 577 891
# Pearson chi-squared test of independence between passenger class and sex.
# NOTE(review): the assignment header asks for Pclass and Embarked, while this
# test uses Sex -- confirm that this is the intended pair of variables.
chisq.test(table(titanic$Pclass, titanic$Sex))
##
## Pearson's Chi-squared test
##
## data: table(titanic$Pclass, titanic$Sex)
## X-squared = 16.971, df = 2, p-value = 0.0002064
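# Disabled: t.test() and qqnorm() need numeric input, but Pclass and Embarked
# are categorical (factors) at this point, so these calls cannot be used as-is.
#t.test(titanic$Pclass, titanic$Embarked, conf.level = 0.99, alternative="greater")
#t.test(titanic$Pclass, titanic$Embarked, paired = TRUE)
#qqnorm(titanic$Pclass - titanic$Embarked)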