library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3

# Load the Titanic passenger data from a local CSV file.
# NOTE(review): the path is absolute and machine-specific -- confirm it exists
# on the machine that runs this script.
titanic <- read.csv(file = "C:/Users/Gilbert/Downloads/titanic2.csv.csv")

# Print the total number of rows in the data set.
titanic %>% tally()
##     n
## 1 891
# Preview 10 randomly chosen passengers, restricted to the columns from
# PassengerId through Ticket (the first nine columns of the data set).
# No random seed is set before this point in the script, so the sampled rows
# change from run to run.
# slice_sample() is used because sample_n() is superseded in dplyr, and the
# columns are picked by name instead of by position.
titanic %>%
  select(PassengerId:Ticket) %>%
  slice_sample(n = 10)
##    PassengerId Survived Pclass
## 1          600        1      1
## 2          205        1      3
## 3          839        1      3
## 4          367        1      1
## 5          487        1      1
## 6          563        0      2
## 7          387        0      3
## 8          264        0      1
## 9          332        0      1
## 10         609        1      2
##                                                     Name    Sex  Age SibSp
## 1           Duff Gordon, Sir. Cosmo Edmund ("Mr Morgan")   male 49.0     1
## 2                               Cohen, Mr. Gurshon "Gus"   male 18.0     0
## 3                                        Chip, Mr. Chang   male 32.0     0
## 4       Warren, Mrs. Frank Manley (Anna Sophia Atkinson) female 60.0     1
## 5        Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby) female 35.0     1
## 6                             Norman, Mr. Robert Douglas   male 28.0     0
## 7                        Goodwin, Master. Sidney Leonard   male  1.0     5
## 8                                  Harrison, Mr. William   male 40.0     0
## 9                                    Partner, Mr. Austen   male 45.5     0
## 10 Laroche, Mrs. Joseph (Juliette Marie Louise Lafargue) female 22.0     1
##    Parch        Ticket
## 1      0      PC 17485
## 2      0      A/5 3540
## 3      0          1601
## 4      0        110813
## 5      0         19943
## 6      0        218629
## 7      2       CA 2144
## 8      0        112059
## 9      0        113043
## 10     2 SC/Paris 2123
# Structural overview: one line per column with its type and first values.
titanic %>% glimpse()
## Rows: 891
## Columns: 12
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ Survived    <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1…
## $ Pclass      <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3…
## $ Name        <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl…
## $ Sex         <chr> "male", "female", "female", "female", "male", "male", "mal…
## $ Age         <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, …
## $ SibSp       <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0…
## $ Parch       <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0…
## $ Ticket      <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37…
## $ Fare        <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,…
## $ Cabin       <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6", "C…
## $ Embarked    <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S"…

# Exploratory data analysis ----
# Plot 1: number of passengers by sex.

ggplot(data = titanic) +
  geom_bar(mapping = aes(x = Sex), fill = "grey") +
  ggtitle("Perbandingan Jenis Kelamin Penumpang Titanic")

# Plot 2: share of survivors vs. non-survivors within each passenger class.
# position = "fill" rescales every bar to a total height of 1, so the y axis
# shows proportions rather than counts. The axis and legend titles are set
# explicitly because the defaults would read "count", "factor(Pclass)" and
# "factor(Survived)".

ggplot(titanic, aes(x = factor(Pclass), fill = factor(Survived))) +
  geom_bar(position = "fill") +
  labs(
    title = "Proporsi Survival Berdasarkan Kelas",
    x = "Kelas Penumpang",
    y = "Proporsi",
    fill = "Selamat"
  )

# Plot 3: distribution of passenger age as a 20-bin histogram.
# Rows with a missing Age are dropped by ggplot2, which emits a warning.

ggplot(data = titanic, mapping = aes(x = Age)) +
  geom_histogram(fill = "purple", bins = 20) +
  ggtitle("Distribusi Umur Penumpang")
## Warning: Removed 177 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Plot 4: box plot of Age for each value of Survived, with outliers in red.
# Rows with a missing Age are dropped by ggplot2, which emits a warning.

ggplot(
  data = titanic,
  mapping = aes(x = factor(Survived), y = Age, fill = factor(Survived))
) +
  geom_boxplot(outlier.color = "red") +
  ggtitle("boxplot")
## Warning: Removed 177 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# Plot 5: scatter plot of Fare against Age with a linear-model trend line.
# Rows with a missing Age are dropped by ggplot2, which emits warnings.

ggplot(data = titanic, mapping = aes(x = Age, y = Fare)) +
  geom_point() +
  geom_smooth(color = "blue", method = "lm") +
  ggtitle("Pengaruh Umur Terhadap Harga Tiket")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 177 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 177 rows containing missing values or values outside the scale range
## (`geom_point()`).