#鐵達尼號titanic
#https://zh.wikipedia.org/wiki/%E6%B3%B0%E5%9D%A6%E5%B0%BC%E5%85%8B%E5%8F%B7
#https://www.youtube.com/watch?v=0QXrCMyxUq4
library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.1.0     ✓ dplyr   1.0.5
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Attach the data package (install once with install.packages("titanic")).
library(titanic)
# Work on a copy of the package's `titanic_train` data frame under a shorter
# name, so the recoding done later in the script leaves the packaged data
# untouched.
titanic <- titanic::titanic_train


#欄位名稱------------------------------------------

# PassengerId 乘客編號
# Survived 是否存活(0:否、1:是)
# Pclass 艙等(1:高等、2:中等、3:低等)
# Name 姓名
# Sex 性別
# Age 年齡(XX.5 表示預估年齡)
# SibSp 在船上的兄弟姊妹及配偶總數
# Parch 在船上的父母及子女總數
# Ticket 船票編號
# Fare 票價
# Cabin 座艙編號
# Embarked 登船港口(C:法國瑟堡、Q:愛爾蘭皇后鎮(今科夫 Cobh)、S:英格蘭南安普敦)
#QQQ鐵達尼號的這份名單內 , 真的有 Jack 和 Rose嗎?
#Q1 : 參加此遊輪的人是不是有錢人比較多呢?
#Q2 : 參加的男生還是女生比較多
#Q3 : 平均票價是多少錢呢?
#Q4 : 不同艙等的平均票價是多少錢呢?
#Q5 : 發生船難後,有多少人倖存和死亡?
#Q6 : 艙等和死亡的關係?
#Q7 : 性別和死亡的關係?
#Q8 : 票價與死亡的關係?
#Q9 : 年齡與死亡的關係?
# ---- Data type conversion ----
# Print the structure first so the raw column types are visible.
str(titanic)
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...
# Recode the integer/character codes as factors so that tables and plots show
# readable labels in a fixed level order.
titanic <- within(titanic, {
  Pclass <- factor(Pclass, levels = 1:3, labels = c("1st", "2nd", "3rd"))
  Survived <- factor(Survived, levels = 0:1, labels = c("died", "alive"))
  Sex <- factor(Sex, levels = c("male", "female"))
})

# Q: Are "Jack" and "Rose" really on this passenger list?
# Name holds full strings such as "Braund, Mr. Owen Harris", so an exact
# comparison with %in% against a bare first name will not match anything.
# Search for either name as a whole word inside each full name instead and
# print the matching entries; character(0) means nobody carries either name.
grep("\\b(Jack|Rose)\\b", titanic$Name, value = TRUE)
# Q1: Were most of the passengers on this voyage wealthy?
# Passenger counts per ticket class.
table(titanic[["Pclass"]])
## 
## 1st 2nd 3rd 
## 216 184 491
# The same counts expressed as a share of all passengers.
prop.table(table(titanic[["Pclass"]]))
## 
##       1st       2nd       3rd 
## 0.2424242 0.2065095 0.5510662
# Bar chart of passengers per class; geom_bar() counts the rows itself.
ggplot(titanic, aes(Pclass)) +
  geom_bar()

# The same bar chart with a title and a subtitle added.
ggplot(titanic, aes(Pclass)) +
  geom_bar() +
  ggtitle("Pclass", subtitle = "peichang")

# Q2: Were there more men or more women on board?
table(titanic[["Sex"]])
## 
##   male female 
##    577    314
# Passenger count per sex, every bar drawn in one fixed colour.
ggplot(titanic, aes(Sex)) +
  geom_bar(fill = "yellow") +
  ggtitle("Sex", subtitle = "peichang")

# The same counts, with each bar coloured by sex (this plot is not split by
# ticket class).
ggplot(titanic, aes(Sex, fill = Sex)) +
  geom_bar() +
  ggtitle("Sex", subtitle = "peichang")

# Sex within each ticket class: cross-tabulation first, then side-by-side
# bars via geom_bar(position = "dodge").
table(titanic[["Pclass"]], titanic[["Sex"]])
##      
##       male female
##   1st  122     94
##   2nd  108     76
##   3rd  347    144
ggplot(titanic, aes(Pclass, fill = Sex)) +
  geom_bar(position = "dodge")

# Q3: What was the average ticket fare?
# Five-number summary plus the mean of Fare.
summary(titanic[["Fare"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    7.91   14.45   32.20   31.00  512.33
# Histogram of the fares, split into 50 bins.
ggplot(titanic, aes(Fare)) +
  geom_histogram(bins = 50) +
  ggtitle("Fare", subtitle = "peichang")

#不同港口的票價

#發現空白資料

#去除空白資料


# Q4: What was the average fare in each ticket class?
# Mean of Fare computed separately for every level of Pclass.
with(titanic, tapply(Fare, Pclass, mean))
##      1st      2nd      3rd 
## 84.15469 20.66218 13.67555
# Q5: After the disaster, how many people survived and how many died?
# The question asks for head counts, so print the counts themselves before
# converting them to proportions. t1 keeps the counts for later reuse.
t1 <- table(titanic$Survived)
t1
## 
##  died alive 
##   549   342
# The same counts as a share of all passengers.
prop.table(t1)
## 
##      died     alive 
## 0.6161616 0.3838384
# Q6: How is ticket class (Pclass) related to survival (Survived)?
# Cross-tabulate outcome (rows) against class (columns); the wrapping
# parentheses print the table while it is stored in t2.
(t2 <- table(titanic[["Survived"]], titanic[["Pclass"]]))
##        
##         1st 2nd 3rd
##   died   80  97 372
##   alive 136  87 119
t2
##        
##         1st 2nd 3rd
##   died   80  97 372
##   alive 136  87 119
# Row proportions: how each outcome group is spread over the classes.
prop.table(t2, margin = 1)
##        
##               1st       2nd       3rd
##   died  0.1457195 0.1766849 0.6775956
##   alive 0.3976608 0.2543860 0.3479532
# Column proportions: the death/survival rate inside each class.
prop.table(t2, margin = 2)
##        
##               1st       2nd       3rd
##   died  0.3703704 0.5271739 0.7576375
##   alive 0.6296296 0.4728261 0.2423625
# Stacked bar chart: one bar per class, filled by outcome.
ggplot(titanic, aes(Pclass, fill = Survived)) +
  geom_bar()

# The same chart drawn with horizontal bars via coord_flip().
ggplot(titanic, aes(Pclass, fill = Survived)) +
  geom_bar() +
  coord_flip()

# The same chart in polar coordinates via coord_polar().
ggplot(titanic, aes(Pclass, fill = Survived)) +
  geom_bar() +
  coord_polar()

#依艙等Pclass分組呈現畫圖

# Q7: How is sex related to survival?
# Cross-tabulate sex (rows) against outcome (columns).
table(titanic$Sex,titanic$Survived)
##         
##          died alive
##   male    468   109
##   female   81   233
t3 <- table(titanic$Sex,titanic$Survived)
t3
##         
##          died alive
##   male    468   109
##   female   81   233
# Row proportions: the death/survival rate within each sex. This is the
# figure that answers the question directly, because it is not distorted by
# there being more men than women on board.
prop.table(t3,1)
##         
##               died     alive
##   male   0.8110919 0.1889081
##   female 0.2579618 0.7420382
# Column proportions: the share of men and women within each outcome group.
prop.table(t3,2)
##         
##               died     alive
##   male   0.8524590 0.3187135
##   female 0.1475410 0.6812865
# Stacked bar chart: one bar per sex, filled by outcome.
ggplot(data = titanic,aes(x=Sex,fill=Survived))+
  geom_bar()

# The same chart drawn with horizontal bars.
ggplot(data = titanic,aes(x=Sex,fill=Survived))+
  geom_bar()+
  coord_flip()

# The same chart in polar coordinates.
ggplot(data = titanic,aes(x=Sex,fill=Survived))+
  geom_bar()+
  coord_polar()

#不同性別的存活長條圖(分組)


#Q8:票價與死亡的關係


#直方圖

#分圖facet_wrap()

#盒鬚圖 geom_boxplot()

# Q9: How is age related to survival?
# Mean age per outcome group. Age contains missing values (NA), so they are
# dropped before averaging.
with(titanic, tapply(Age, Survived, mean, na.rm = TRUE))
##     died    alive 
## 30.62618 28.34369