library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
# Load the Titanic passenger data into the `titanic` data frame.
# NOTE(review): the path is absolute and machine-specific, and the file name
# carries a doubled ".csv.csv" extension -- confirm before running elsewhere.
titanic <- read.csv(
  file = "C:/Users/Gilbert/Downloads/titanic2.csv.csv",
  header = TRUE
)
# Count the rows of the data set.
titanic %>%
  tally()
## n
## 1 891
# Preview 10 randomly chosen passengers, showing only the first nine columns
# (PassengerId through Ticket).
# - The columns are selected by name instead of by position (1:9) so the
#   selection is self-describing.
# - slice_sample() is used because dplyr has superseded sample_n().
# The draw is random, so different rows are shown on every run; call
# set.seed() beforehand if a reproducible preview is needed.
titanic %>%
  select(PassengerId:Ticket) %>%
  slice_sample(n = 10)
## PassengerId Survived Pclass
## 1 600 1 1
## 2 205 1 3
## 3 839 1 3
## 4 367 1 1
## 5 487 1 1
## 6 563 0 2
## 7 387 0 3
## 8 264 0 1
## 9 332 0 1
## 10 609 1 2
## Name Sex Age SibSp
## 1 Duff Gordon, Sir. Cosmo Edmund ("Mr Morgan") male 49.0 1
## 2 Cohen, Mr. Gurshon "Gus" male 18.0 0
## 3 Chip, Mr. Chang male 32.0 0
## 4 Warren, Mrs. Frank Manley (Anna Sophia Atkinson) female 60.0 1
## 5 Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby) female 35.0 1
## 6 Norman, Mr. Robert Douglas male 28.0 0
## 7 Goodwin, Master. Sidney Leonard male 1.0 5
## 8 Harrison, Mr. William male 40.0 0
## 9 Partner, Mr. Austen male 45.5 0
## 10 Laroche, Mrs. Joseph (Juliette Marie Louise Lafargue) female 22.0 1
## Parch Ticket
## 1 0 PC 17485
## 2 0 A/5 3540
## 3 0 1601
## 4 0 110813
## 5 0 19943
## 6 0 218629
## 7 2 CA 2144
## 8 0 112059
## 9 0 113043
## 10 2 SC/Paris 2123
# Structure overview: row/column counts, column types and leading values.
titanic %>%
  glimpse()
## Rows: 891
## Columns: 12
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ Survived <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1…
## $ Pclass <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3…
## $ Name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl…
## $ Sex <chr> "male", "female", "female", "female", "male", "male", "mal…
## $ Age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, …
## $ SibSp <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0…
## $ Parch <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0…
## $ Ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37…
## $ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,…
## $ Cabin <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6", "C…
## $ Embarked <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S"…
## Exploratory data analysis ----
# 1. Bar chart of the number of passengers of each sex.
titanic %>%
  ggplot(mapping = aes(x = Sex)) +
  geom_bar(fill = "grey") +
  ggtitle("Perbandingan Jenis Kelamin Penumpang Titanic")
# 2. Breakdown of the Survived values within each passenger class.
# position = "fill" stacks every bar to a total height of 1, so the y axis
# shows a proportion. It is labelled as such here instead of the default
# "count", and the x-axis and legend get readable titles instead of the raw
# `factor(...)` expressions.
ggplot(titanic, aes(x = factor(Pclass), fill = factor(Survived))) +
  geom_bar(position = "fill") +
  labs(
    title = "Proporsi Survival Berdasarkan Kelas",
    x = "Kelas",
    y = "Proporsi",
    fill = "Survived"
  )
# 3. Histogram of passenger ages, using 20 bins.
# Age is missing for some passengers (177 rows when this script was last
# run). na.rm = TRUE drops those rows explicitly, so the plot no longer emits
# a "Removed ... rows containing non-finite ..." warning.
ggplot(titanic, aes(x = Age)) +
  geom_histogram(bins = 20, fill = "purple", na.rm = TRUE) +
  labs(title = "Distribusi Umur Penumpang")
# 4. Box plots of Age, one box per value of Survived; outliers drawn in red.
# Rows with a missing Age (177 when this script was last run) are dropped
# explicitly via na.rm = TRUE instead of being removed with a warning.
# NOTE(review): the title "boxplot" is a placeholder, and the fill legend
# repeats the x axis -- consider a descriptive title and show.legend = FALSE.
ggplot(
  titanic,
  aes(x = factor(Survived), y = Age, fill = factor(Survived))
) +
  geom_boxplot(outlier.color = "red", na.rm = TRUE) +
  labs(title = "boxplot")
# 5. Scatter plot of Fare against Age with a fitted linear-model line.
# - na.rm = TRUE on both layers drops the rows with a missing Age (177 when
#   this script was last run) explicitly instead of with a warning per layer.
# - formula = y ~ x states the model geom_smooth() would otherwise pick by
#   default, which silences its "using formula = 'y ~ x'" message.
ggplot(titanic, aes(x = Age, y = Fare)) +
  geom_point(na.rm = TRUE) +
  geom_smooth(
    method = "lm",
    formula = y ~ x,
    color = "blue",
    na.rm = TRUE
  ) +
  labs(title = "Pengaruh Umur Terhadap Harga Tiket")