## Including Plots
You can also embed plots, for example:
```r
library(readr)
library(rpart)
library(rpart.plot)
library(rattle)
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(RColorBrewer)
library(e1071)
# Steps 1-2: read the Titanic passenger file into a tibble named t3.
t3 <- readr::read_csv(file = "Downloads/titanic3.csv")
## Rows: 1309 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): pclass, name, sex, ticket, cabin, embarked, boat, home.dest
## dbl (6): survived, age, sibsp, parch, fare, body
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet viewer only in an interactive session; View() is of no
# use (and may fail) when the document is knitted or run through Rscript.
if (interactive()) {
  View(t3)
}
# Step 3: keep only the outcome (survived) and the candidate predictors.
model_columns <- c(
  "survived", "embarked", "sex", "age", "sibsp", "parch", "fare"
)
titanic <- t3[, model_columns]
titanic
## # A tibble: 1,309 × 7
## survived embarked sex age sibsp parch fare
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 S female 29 0 0 211.
## 2 1 S male 0.92 1 2 152.
## 3 0 S female 2 1 2 152.
## 4 0 S male 30 1 2 152.
## 5 0 S female 25 1 2 152.
## 6 1 S male 48 0 0 26.6
## 7 1 S female 63 1 0 78.0
## 8 0 S male 39 0 0 0
## 9 1 S female 53 2 0 51.5
## 10 0 C male 71 0 0 49.5
## # ℹ 1,299 more rows
# Step 4: summarise every retained column; the recorded output below also
# reports the NA counts for age and fare.
summary(titanic)
## survived embarked sex age
## Min. :0.000 Length:1309 Length:1309 Min. : 0.17
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:21.00
## Median :0.000 Mode :character Mode :character Median :28.00
## Mean :0.382 Mean :29.88
## 3rd Qu.:1.000 3rd Qu.:39.00
## Max. :1.000 Max. :80.00
## NA's :263
## sibsp parch fare
## Min. :0.0000 Min. :0.000 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.: 7.896
## Median :0.0000 Median :0.000 Median : 14.454
## Mean :0.4989 Mean :0.385 Mean : 33.295
## 3rd Qu.:1.0000 3rd Qu.:0.000 3rd Qu.: 31.275
## Max. :8.0000 Max. :9.000 Max. :512.329
## NA's :1
# Show the structure (column names and types) of the selected data.
str(titanic)
## tibble [1,309 × 7] (S3: tbl_df/tbl/data.frame)
## $ survived: num [1:1309] 1 1 0 0 0 1 1 0 1 0 ...
## $ embarked: chr [1:1309] "S" "S" "S" "S" ...
## $ sex : chr [1:1309] "female" "male" "female" "male" ...
## $ age : num [1:1309] 29 0.92 2 30 25 48 63 39 53 71 ...
## $ sibsp : num [1:1309] 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : num [1:1309] 0 2 2 2 2 0 0 0 0 0 ...
## $ fare : num [1:1309] 211 152 152 152 152 ...
# Step 6: discard every row with a missing value in any retained column,
# then summarise the cleaned data again.
titanic <- stats::na.omit(object = titanic)
summary(object = titanic)
## survived embarked sex age
## Min. :0.0000 Length:1043 Length:1043 Min. : 0.17
## 1st Qu.:0.0000 Class :character Class :character 1st Qu.:21.00
## Median :0.0000 Mode :character Mode :character Median :28.00
## Mean :0.4075 Mean :29.81
## 3rd Qu.:1.0000 3rd Qu.:39.00
## Max. :1.0000 Max. :80.00
## sibsp parch fare
## Min. :0.0000 Min. :0.0000 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 8.05
## Median :0.0000 Median :0.0000 Median : 15.75
## Mean :0.5043 Mean :0.4219 Mean : 36.60
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 35.08
## Max. :8.0000 Max. :6.0000 Max. :512.33
# Step 5: relative frequency of each value of the 0/1 survived column.
prop.table(table(titanic[["survived"]]))
##
## 0 1
## 0.5925216 0.4074784
# Steps 4 and 8: correlations among the numeric attributes.
# titanic_correlation is taken from the raw t3 data, where age and fare still
# contain missing values, so cor() is restricted to complete rows; without
# that every correlation involving age or fare is NA.
titanic_correlation <- t3[, c("age", "sibsp", "parch", "fare", "survived")]
tcor <- cor(titanic_correlation, use = "complete.obs")
# calculate standard deviation for the numeric attributes only; sd() on the
# character columns (embarked, sex) only yields NA plus a coercion warning
is_numeric_col <- vapply(titanic, is.numeric, logical(1))
vapply(titanic[is_numeric_col], sd, numeric(1))
## survived age sibsp parch fare
## 0.4916009 14.3662545 0.9130797 0.8406546 55.7536477
# calculate skewness for each variable
# Iterate over the columns with vapply() instead of apply(), which would first
# coerce the tibble to a matrix. na.rm = TRUE is needed because age and fare
# contain missing values in t3, which otherwise makes their skewness NA.
# NOTE(review): the recorded output below predates this change (it still shows
# NA for age and fare); re-knit the document to refresh it.
skew <- vapply(titanic_correlation, skewness, numeric(1), na.rm = TRUE)
# display skewness, larger/smaller deviations from 0 show more skew
print(skew)
## age sibsp parch fare survived
## NA 3.835415 3.660674 NA 0.485290
# Step 7: recode the outcome and the two categorical predictors as factors.
factor_columns <- c("survived", "embarked", "sex")
for (column_name in factor_columns) {
  titanic[[column_name]] <- factor(titanic[[column_name]])
}
# load package
library(corrplot)
## corrplot 0.92 loaded
# Step 9: visualise the relationships between attributes.
# Correlation matrix drawn with the "circle" method.
corrplot(corr = tcor, method = "circle")
# NOTE(review): plot() on a matrix appears to scatter its first column against
# its second rather than show the whole matrix -- confirm this plot is wanted.
plot(tcor)
# Scatter-plot matrix of every pair of columns in the cleaned data.
pairs(x = titanic)
# Steps 10-11: reproducible random 80/20 split of the cleaned data.
# The sizes are derived from nrow() so the split follows the data instead of
# relying on hard-coded row numbers.
set.seed(1000)
n_total <- nrow(titanic)
n_train <- floor(0.8 * n_total)
train_index <- sample(seq_len(n_total), n_train)
train <- titanic[train_index, ]
test <- titanic[-train_index, ]
#11) Build your training (till index 1046) and test (till index 1308) datasets
#dim(titanic)
#[1] 1043 7
# Step 11: ordered split -- the first 80% of rows train the model and the
# remaining rows are held out for testing.
# NOTE(review): the model code that follows uses this ordered split; the
# random train/test objects above are not referenced again in the visible
# source -- confirm which split is intended.
is_train_row <- seq_len(n_total) <= n_train
titanic_train <- titanic[is_train_row, ]
titanic_test <- titanic[!is_train_row, ]
# Step 13: fit a classification tree (method = "class") for survival on the
# training rows, then show the structure of the full cleaned data.
fit <- rpart(
  formula = survived ~ sex + age + sibsp + parch + fare + embarked,
  data = titanic_train,
  method = "class"
)
str(object = titanic)
## tibble [1,043 × 7] (S3: tbl_df/tbl/data.frame)
## $ survived: Factor w/ 2 levels "0","1": 2 2 1 1 1 2 2 1 2 1 ...
## $ embarked: Factor w/ 3 levels "C","Q","S": 3 3 3 3 3 3 3 3 3 1 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
## $ age : num [1:1043] 29 0.92 2 30 25 48 63 39 53 71 ...
## $ sibsp : num [1:1043] 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : num [1:1043] 0 2 2 2 2 0 0 0 0 0 ...
## $ fare : num [1:1043] 211 152 152 152 152 ...
## - attr(*, "na.action")= 'omit' Named int [1:266] 16 38 41 47 60 70 71 75 81 107 ...
## ..- attr(*, "names")= chr [1:266] "16" "38" "41" "47" ...
# Steps 14, 16: print the fitted classification tree (node, split, n, loss,
# predicted class and class probabilities for each node).
# NOTE(review): step 16 asks for the plot to be saved to an image file; no
# code in the visible source does that.
fit
## n= 834
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 834 373 0 (0.5527578 0.4472422)
## 2) sex=male 511 110 0 (0.7847358 0.2152642)
## 4) age>=10 483 89 0 (0.8157350 0.1842650) *
## 5) age< 10 28 7 1 (0.2500000 0.7500000) *
## 3) sex=female 323 60 1 (0.1857585 0.8142415)
## 6) fare< 10.1625 42 19 0 (0.5476190 0.4523810)
## 12) age>=19.5 28 9 0 (0.6785714 0.3214286) *
## 13) age< 19.5 14 4 1 (0.2857143 0.7142857) *
## 7) fare>=10.1625 281 37 1 (0.1316726 0.8683274)
## 14) sibsp>=3.5 7 1 0 (0.8571429 0.1428571) *
## 15) sibsp< 3.5 274 31 1 (0.1131387 0.8868613) *
# Draw the fitted tree with rattle's decorated rpart plot.
fancyRpartPlot(model = fit)
#17 examine the tree obtained. what's the most "important" feature over
#which the tree first split?
#sex is the most important
# Step 18: classify the held-out passengers with the fitted tree and keep the
# predicted classes in `predicts`, then display them.
predicts <- predict(object = fit, newdata = titanic_test, type = "class")
predicts
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 0 1 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 1 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 0 0 0 0 0 1 1 1 1 1 1 0 0 1 1 0 1 0 0 0
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 1 0 1 1
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 0 0 1 1 0 0 0 0 1 1 1 0 1 0 0 0 1 0 1 0
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0
## 201 202 203 204 205 206 207 208 209
## 0 0 0 1 0 1 0 0 0
## Levels: 0 1
# Step 19: two-column result -- each test passenger's sex alongside the
# class predicted for that passenger.
Result <- data.frame(
  passengersex = titanic_test[["sex"]],
  survivals = predicts
)
Result
## passengersex survivals
## 1 male 0
## 2 male 0
## 3 male 0
## 4 male 0
## 5 male 0
## 6 male 0
## 7 female 0
## 8 female 0
## 9 male 0
## 10 female 1
## 11 male 0
## 12 male 0
## 13 male 0
## 14 female 1
## 15 male 0
## 16 male 0
## 17 male 0
## 18 female 0
## 19 male 0
## 20 male 0
## 21 male 0
## 22 male 0
## 23 female 0
## 24 male 0
## 25 male 0
## 26 female 1
## 27 female 0
## 28 male 0
## 29 female 1
## 30 male 0
## 31 male 0
## 32 male 0
## 33 male 0
## 34 male 0
## 35 male 0
## 36 male 1
## 37 female 1
## 38 male 0
## 39 female 0
## 40 male 0
## 41 male 0
## 42 female 1
## 43 female 1
## 44 male 0
## 45 female 1
## 46 male 0
## 47 male 0
## 48 female 1
## 49 female 0
## 50 male 0
## 51 female 1
## 52 female 0
## 53 male 0
## 54 male 0
## 55 male 0
## 56 male 0
## 57 male 0
## 58 female 0
## 59 male 0
## 60 male 0
## 61 female 0
## 62 male 1
## 63 male 0
## 64 male 0
## 65 female 0
## 66 male 0
## 67 male 0
## 68 male 0
## 69 female 0
## 70 female 0
## 71 male 0
## 72 male 0
## 73 female 0
## 74 male 1
## 75 male 1
## 76 female 1
## 77 female 1
## 78 female 1
## 79 male 1
## 80 male 1
## 81 male 1
## 82 male 0
## 83 male 0
## 84 female 1
## 85 male 0
## 86 male 0
## 87 male 0
## 88 male 1
## 89 female 1
## 90 female 1
## 91 male 0
## 92 male 0
## 93 male 0
## 94 male 0
## 95 male 0
## 96 female 0
## 97 male 0
## 98 male 0
## 99 female 1
## 100 male 0
## 101 male 0
## 102 male 0
## 103 male 0
## 104 male 0
## 105 male 0
## 106 male 1
## 107 male 1
## 108 male 1
## 109 male 1
## 110 female 1
## 111 female 1
## 112 male 0
## 113 male 0
## 114 female 1
## 115 female 1
## 116 male 0
## 117 female 1
## 118 male 0
## 119 male 0
## 120 male 0
## 121 male 0
## 122 male 0
## 123 male 0
## 124 female 0
## 125 male 0
## 126 female 1
## 127 female 1
## 128 female 1
## 129 male 0
## 130 male 0
## 131 male 0
## 132 male 0
## 133 male 0
## 134 male 0
## 135 male 0
## 136 female 1
## 137 male 1
## 138 male 0
## 139 female 1
## 140 female 1
## 141 male 0
## 142 female 1
## 143 male 0
## 144 male 0
## 145 male 0
## 146 male 0
## 147 female 0
## 148 male 0
## 149 male 0
## 150 female 0
## 151 male 0
## 152 male 0
## 153 female 1
## 154 female 1
## 155 male 0
## 156 male 0
## 157 male 0
## 158 male 0
## 159 male 0
## 160 male 0
## 161 male 0
## 162 male 0
## 163 male 1
## 164 female 1
## 165 male 0
## 166 male 0
## 167 male 0
## 168 male 0
## 169 male 1
## 170 female 1
## 171 female 1
## 172 male 0
## 173 female 1
## 174 female 0
## 175 male 0
## 176 male 0
## 177 female 1
## 178 male 0
## 179 female 1
## 180 male 0
## 181 male 0
## 182 male 0
## 183 male 0
## 184 female 1
## 185 male 0
## 186 male 0
## 187 female 1
## 188 male 0
## 189 male 0
## 190 female 1
## 191 male 0
## 192 male 0
## 193 male 0
## 194 female 0
## 195 male 0
## 196 male 0
## 197 male 0
## 198 female 0
## 199 male 0
## 200 male 0
## 201 male 0
## 202 male 0
## 203 male 0
## 204 female 1
## 205 male 0
## 206 female 1
## 207 male 0
## 208 male 0
## 209 male 0
# Step 20: export the result table as CSV, omitting the row-name column.
utils::write.csv(x = Result, file = "TitanicTree.csv", row.names = FALSE)
```

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.