Github Code: Hoon0427/RPubs
File Download: Titanic Data
library(tidyverse) #Tidyverse 패키지
library(ggplot2) #시각화 패키지
library(plotly) #반응형 시각화 패키지
library(rpart) # 의사결정 나무
library(rpart.plot) # 의사결정 나무 시각화
library(caret) # 데이터 처리 패키지
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(e1071) #혼동행렬 패키지
library(randomForest) #Random Forest 패키지
# Load the Titanic train/test CSV files from the working directory.
# stringsAsFactors = TRUE is spelled out because the read.csv() default
# changed to FALSE in R 4.0.0; the str() output recorded below shows the text
# columns (Sex, Embarked, ...) as Factor, and Sex is later used as a
# categorical predictor.
training_set <- read.csv("train.csv", stringsAsFactors = TRUE)
test_set <- read.csv("test.csv", stringsAsFactors = TRUE)
# Inspect column types of the raw training data.
str(training_set)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
# Recode column types on the training data: free-text identifier columns
# become plain character vectors and the ticket class becomes a factor.
training_set[c("Name", "Ticket", "Cabin")] <-
  lapply(training_set[c("Name", "Ticket", "Cabin")], as.character)
training_set[["Pclass"]] <- as.factor(training_set[["Pclass"]])
# Show the structure after the conversion.
str(training_set)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
# Apply the same type recoding to the test data: text identifier columns
# become character vectors and the ticket class becomes a factor.
test_set[c("Name", "Ticket", "Cabin")] <-
  lapply(test_set[c("Name", "Ticket", "Cabin")], as.character)
test_set[["Pclass"]] <- as.factor(test_set[["Pclass"]])
# Show the structure after the conversion.
str(test_set)
## 'data.frame': 418 obs. of 11 variables:
## $ PassengerId: int 892 893 894 895 896 897 898 899 900 901 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 3 2 3 3 3 3 2 3 3 ...
## $ Name : chr "Kelly, Mr. James" "Wilkes, Mrs. James (Ellen Needs)" "Myles, Mr. Thomas Francis" "Wirz, Mr. Albert" ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 2 2 1 2 1 2 1 2 ...
## $ Age : num 34.5 47 62 27 22 14 30 26 18 21 ...
## $ SibSp : int 0 1 0 0 1 0 0 1 0 2 ...
## $ Parch : int 0 0 0 0 1 0 0 1 0 0 ...
## $ Ticket : chr "330911" "363272" "240276" "315154" ...
## $ Fare : num 7.83 7 9.69 8.66 12.29 ...
## $ Cabin : chr "" "" "" "" ...
## $ Embarked : Factor w/ 3 levels "C","Q","S": 2 3 2 3 3 3 2 3 1 3 ...
# Replace missing Age values in the test set with the mean of the
# non-missing test-set ages. TRUE is used instead of T, which is an ordinary
# variable that can be reassigned.
test_set$Age[is.na(test_set$Age)] <- mean(test_set$Age, na.rm = TRUE)
# Count the remaining NAs per column. vapply() guarantees one integer per
# column, whereas the return type of sapply() depends on its input.
vapply(test_set, function(x) {
  sum(is.na(x))
}, integer(1))
## PassengerId Pclass Name Sex Age SibSp
## 0 0 0 0 0 0
## Parch Ticket Fare Cabin Embarked
## 0 0 1 0 0
# Print per-column summary statistics of the training data.
summary(training_set)
## PassengerId Survived Pclass Name Sex
## Min. : 1.0 Min. :0.0000 1:216 Length:891 female:314
## 1st Qu.:223.5 1st Qu.:0.0000 2:184 Class :character male :577
## Median :446.0 Median :0.0000 3:491 Mode :character
## Mean :446.0 Mean :0.3838
## 3rd Qu.:668.5 3rd Qu.:1.0000
## Max. :891.0 Max. :1.0000
##
## Age SibSp Parch Ticket
## Min. : 0.42 Min. :0.000 Min. :0.0000 Length:891
## 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000 Class :character
## Median :28.00 Median :0.000 Median :0.0000 Mode :character
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Fare Cabin Embarked
## Min. : 0.00 Length:891 : 2
## 1st Qu.: 7.91 Class :character C:168
## Median : 14.45 Mode :character Q: 77
## Mean : 32.20 S:644
## 3rd Qu.: 31.00
## Max. :512.33
##
Pclass 는 1등급이 216명, 2등급이 184 명, 3등급이 491명으로 구성되어 있다.
성별은 여자가 314명, 남자가 577명으로 분포되어 있다.
나이는 최솟값이 0.42 세, 최댓값이 80 세로 되어있다. 그리고 평균은 29.7세, 1분위수는 20.12세, 3분위수는 38세, NA가 177명 인것으로 봐서 당시 나이가 제대로 파악되지 않았음을 알 수 있다.
함께 탑승한 형제 또는 배우자의 수는 최대 8명 그리고 평균적으로 0.5명인 것으로 보인다.
함께 탑승한 부모 또는 자녀의 수는 최대 6명이고 평균이 0.38명으로 보인다.
# Total number of missing cells in the training data.
sum(is.na(training_set))
## [1] 177
# Number of missing values in each column, one integer per column.
vapply(
  training_set,
  function(column) sum(is.na(column)),
  FUN.VALUE = integer(1)
)
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 0 0
# Drop every row that contains a missing value. According to the per-column
# counts printed above, only Age has NAs, so this removes the passengers
# whose age is unknown.
training_set<-na.omit(training_set)
# Confirm that no missing values remain.
sum(is.na(training_set))
## [1] 0
# Bin Age into decade groups stored as a factor column `Ages`, with levels
# ordered from youngest to oldest.
# cut() with right = FALSE builds half-open intervals [lower, upper), which
# matches the thresholds "Age < 10", "Age < 20", ..., with everything from 60
# upwards falling into "over 60". Unlike a catch-all fallback branch, a
# missing Age stays NA instead of being silently labelled "over 60".
training_set <- training_set %>%
  mutate(Ages = cut(
    Age,
    breaks = c(-Inf, 10, 20, 30, 40, 50, 60, Inf),
    labels = c(
      "Under 10", "10 ~ 20", "20 ~ 30", "30 ~ 40",
      "40 ~ 50", "50 ~ 60", "over 60"
    ),
    right = FALSE
  ))
# Count passengers per age group, then draw the counts as coloured bars with
# the count printed inside each bar.
data_cleanging <- summarise(group_by(training_set, Ages), Ages_count = n())
ggplot(
  data = data_cleanging,
  mapping = aes(x = Ages, y = Ages_count, fill = Ages)
) +
  geom_col() +
  geom_text(
    aes(label = Ages_count),
    vjust = 3, hjust = 0.5, color = "black", size = 4
  ) +
  theme(
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10)
  )
# Build a stacked bar chart of passenger counts.
#   data    - data frame to plot
#   mapping - aesthetic mapping created with aes(); here x is Survived and
#             fill is the grouping variable
#   title   - plot title
# Returns the ggplot object; the caller decides how to render it.
plot_survival_counts <- function(data, mapping, title) {
  ggplot(data, mapping) +
    geom_bar() +
    ggtitle(title) +
    theme_bw()
}

# Survival by sex
ggplot_data <- plot_survival_counts(
  training_set,
  aes(x = Survived, fill = Sex),
  "성별에 따른 생존 여부"
)
ggplotly(ggplot_data, height = 500, width = 800)

# Survival by ticket class
ggplot_data <- plot_survival_counts(
  training_set,
  aes(x = Survived, fill = Pclass),
  " Pclass에 따른 생존 여부 "
)
ggplotly(ggplot_data, height = 500, width = 800)

# Survival by age group
ggplot_data <- plot_survival_counts(
  training_set,
  aes(x = Survived, fill = Ages),
  " 나이에 따른 생존 여부 "
)
ggplotly(ggplot_data, height = 500, width = 800)

# Survival by number of siblings/spouses aboard
ggplot_data <- plot_survival_counts(
  training_set,
  aes(x = Survived, fill = factor(SibSp)),
  "같이 탑승한 배우자 또는 형제에 따른 생존여부"
)
ggplotly(ggplot_data, height = 500, width = 800)

# Survival by number of parents/children aboard
ggplot_data <- plot_survival_counts(
  training_set,
  aes(x = Survived, fill = factor(Parch)),
  "함께 탑승한 부모 또는 자녀의 수에 따른 생존여부"
)
ggplotly(ggplot_data, height = 500, width = 800)
# Convert the 0/1 survival indicator to a two-level factor; the predictions
# further down request type = "class", i.e. Survived is treated as a class
# label rather than a number.
training_set$Survived <- as.factor(training_set$Survived)
# Inspect the final structure of the training data.
str(training_set)
## 'data.frame': 714 obs. of 13 variables:
## $ PassengerId: int 1 2 3 4 5 7 8 9 10 11 ...
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 2 2 2 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 1 3 3 2 3 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 1 1 1 ...
## $ Age : num 22 38 26 35 35 54 2 27 14 4 ...
## $ SibSp : int 1 1 0 1 0 0 3 0 1 1 ...
## $ Parch : int 0 0 0 0 0 0 1 2 0 1 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 4 4 4 2 4 ...
## $ Ages : Factor w/ 7 levels "Under 10","10 ~ 20",..: 3 4 3 4 4 6 1 3 2 1 ...
# Fit a decision tree for Survived from ticket class, age and sex
rpart_m <- rpart(
  formula = Survived ~ Pclass + Age + Sex,
  data = training_set
)
# Draw the fitted decision tree
prp(rpart_m, type = 4, extra = 2, digits = 3)
# Predict a class label for every passenger in the test set
rpart_p <- predict(object = rpart_m, newdata = test_set, type = "class")
# Fix the RNG seed first: a random forest is built from random draws, so an
# unseeded run produces different importances and predictions every time.
set.seed(42)
# Fit the random forest used for prediction
rf_m <- randomForest(Survived ~ Pclass + Age + Sex, data = training_set)
# Fit a second forest with importance = TRUE so that permutation-based
# importance measures are computed (TRUE instead of the reassignable T)
rf_info <- randomForest(Survived ~ Sex + Age + Pclass , data = training_set, importance = TRUE)
# Print the variable importance table
# (the numbers recorded below come from an earlier, unseeded run and may
# differ from a fresh run)
importance(rf_info)
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## Sex 44.26519 57.01307 53.26653 82.21721
## Age 17.41469 16.78564 24.13932 20.54890
## Pclass 23.51503 28.04113 28.98233 34.09888
# Plot the variable importance
varImpPlot(rf_info)
# Predict a class label for every passenger in the test set
rf_p <- predict(rf_m, newdata=test_set, type="class")
# Decision tree submission file.
# The id column must be spelled "PassengerId" (lower-case d): that is the
# column name in the source data and the header the Kaggle Titanic
# submission format expects; "PassengerID" does not match it.
Titanic_rpart <- data.frame(PassengerId = test_set$PassengerId, Survived = rpart_p)
write.csv(Titanic_rpart, file = "Titanic_rpart_submit.csv", row.names = FALSE)
# Random forest submission file, same two-column layout.
Titanic_rf <- data.frame(PassengerId = test_set$PassengerId, Survived = rf_p)
write.csv(Titanic_rf, file = "Titanic_rf_submit.csv", row.names = FALSE)