This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Import the dataset.
# NOTE(review): this is a machine-specific absolute path; point `titanic_path`
# at your own copy of the CSV before knitting on another machine.
titanic_path <- "C:/Users/asus/Downloads/titanic2.csv.csv"
# Fail early with a clear message instead of read.csv()'s generic
# "cannot open file" error.
if (!file.exists(titanic_path)) {
  stop("Titanic CSV not found at: ", titanic_path, call. = FALSE)
}
titanic <- read.csv(titanic_path)
# Row count of the loaded data frame.
tally(titanic)
## n
## 1 891
# Preview 10 randomly chosen rows, restricted to the first nine columns.
# slice_sample() replaces the superseded sample_n(). No seed is set in this
# document, so the rows drawn change from run to run.
titanic %>%
  select(1:9) %>%
  slice_sample(n = 10)
## PassengerId Survived Pclass
## 1 212 1 2
## 2 780 1 1
## 3 745 1 3
## 4 140 0 1
## 5 119 0 1
## 6 456 1 3
## 7 364 0 3
## 8 2 1 1
## 9 723 0 2
## 10 334 0 3
## Name Sex Age SibSp Parch
## 1 Cameron, Miss. Clear Annie female 35 0 0
## 2 Robert, Mrs. Edward Scott (Elisabeth Walton McMillan) female 43 0 1
## 3 Stranden, Mr. Juho male 31 0 0
## 4 Giglio, Mr. Victor male 24 0 0
## 5 Baxter, Mr. Quigg Edmond male 24 0 1
## 6 Jalsevac, Mr. Ivan male 29 0 0
## 7 Asim, Mr. Adola male 35 0 0
## 8 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## 9 Gillespie, Mr. William Henry male 34 0 0
## 10 Vander Planke, Mr. Leo Edmondus male 16 2 0
## Ticket
## 1 F.C.C. 13528
## 2 24160
## 3 STON/O 2. 3101288
## 4 PC 17593
## 5 PC 17558
## 6 349240
## 7 SOTON/O.Q. 3101310
## 8 PC 17599
## 9 12233
## 10 345764
# Summary: compact per-column overview (type and leading values).
titanic %>%
  glimpse()
## Rows: 891
## Columns: 12
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ Survived <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1…
## $ Pclass <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3…
## $ Name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl…
## $ Sex <chr> "male", "female", "female", "female", "male", "male", "mal…
## $ Age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, …
## $ SibSp <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0…
## $ Parch <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0…
## $ Ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37…
## $ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,…
## $ Cabin <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6", "C…
## $ Embarked <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S"…
# Base-R structure listing of the same data frame.
utils::str(object = titanic)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
# Per-column summary (quantiles for numeric columns, length/class for text).
summary(object = titanic)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
# Exploratory data analysis
# 1. Bar chart of passenger counts by Sex
titanic %>%
  ggplot(mapping = aes(x = Sex)) +
  geom_bar(fill = "pink") +
  labs(title = "Perbandingan Jenis Kelamin di Titanic")
# 2. Survival proportion within each passenger class.
# position = "fill" rescales every bar to height 1, so the y axis shows a
# proportion rather than a count. The labels are set explicitly to replace the
# defaults "count", "factor(Pclass)" and "factor(Survived)".
ggplot(titanic, aes(x = factor(Pclass), fill = factor(Survived))) +
  geom_bar(position = "fill") +
  labs(
    title = "Proporsi Survival berdasarkan kelas",
    x = "Pclass",
    y = "Proporsi",
    fill = "Survived"
  )
# 3. Passenger age distribution (20-bin histogram)
titanic %>%
  ggplot(mapping = aes(x = Age)) +
  geom_histogram(fill = "purple", bins = 20) +
  labs(title = "Distribusi umur penumpang")
## Warning: Removed 177 rows containing non-finite outside the scale range
## (`stat_bin()`).
# 4. Box plot of Age for each value of Survived
ggplot(
  data = titanic,
  mapping = aes(x = factor(Survived), y = Age, fill = factor(Survived))
) +
  geom_boxplot() +
  labs(title = "Boxplot")
## Warning: Removed 177 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# 5. Scatter plot of Fare against Age with a linear-regression trend line.
# The formula is spelled out so geom_smooth() does not have to guess it and
# emit its "using formula = 'y ~ x'" message when the document is knitted.
ggplot(titanic, aes(x = Age, y = Fare)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, color = "blue") +
  labs(title = "Pengaruh umur terhadap harga tiket")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 177 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 177 rows containing missing values or values outside the scale range
## (`geom_point()`).