#wrangle
library('dplyr') #data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library('tidyr') # data manipulation
library('readr') # data input
library('stringr') # string manipulation
library('forcats') # factor manipulation
library('modelr') # modelling helpers
library('ggplot2') # data visualization
# Load the training data ----
# NOTE(review): setwd() ties the script to one machine's folder layout. It is
# kept here because the relative file name below is resolved against it.
setwd("C:/Users/kub/OneDrive/Documents/VDE")
train <- read.csv(file = "datatrain.csv")
# Preview the first rows
head(train)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp Parch
## 1 Braund, Mr. Owen Harris male 22 1 0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## 3 Heikkinen, Miss. Laina female 26 0 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
## 5 Allen, Mr. William Henry male 35 0 0
## 6 Moran, Mr. James male NA 0 0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.2500 S
## 2 PC 17599 71.2833 C85 C
## 3 STON/O2. 3101282 7.9250 S
## 4 113803 53.1000 C123 S
## 5 373450 8.0500 S
## 6 330877 8.4583 Q
# Recode the outcome column as a factor so it is treated as categorical
train[["Survived"]] <- factor(train[["Survived"]])
head(train)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp Parch
## 1 Braund, Mr. Owen Harris male 22 1 0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## 3 Heikkinen, Miss. Laina female 26 0 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
## 5 Allen, Mr. William Henry male 35 0 0
## 6 Moran, Mr. James male NA 0 0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.2500 S
## 2 PC 17599 71.2833 C85 C
## 3 STON/O2. 3101282 7.9250 S
## 4 113803 53.1000 C123 S
## 5 373450 8.0500 S
## 6 330877 8.4583 Q
# Per-column overview: level counts for the Survived factor, quantiles (and NA
# count where present) for numeric columns, length/class for character columns
summary(train)
## PassengerId Survived Pclass Name Sex
## Min. : 1.0 0:549 Min. :1.000 Length:891 Length:891
## 1st Qu.:223.5 1:342 1st Qu.:2.000 Class :character Class :character
## Median :446.0 Median :3.000 Mode :character Mode :character
## Mean :446.0 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:3.000
## Max. :891.0 Max. :3.000
##
## Age SibSp Parch Ticket
## Min. : 0.42 Min. :0.000 Min. :0.0000 Length:891
## 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000 Class :character
## Median :28.00 Median :0.000 Median :0.0000 Mode :character
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Fare Cabin Embarked
## Min. : 0.00 Length:891 Length:891
## 1st Qu.: 7.91 Class :character Class :character
## Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
# Pclass pie chart ----
# Count passengers per class, then derive each class's percentage share and a
# "<class> (<share>%)" label for the slices.
pclass_counts <- train %>%
  group_by(Pclass) %>%
  summarise(count = n()) %>%
  mutate(percentage = count / sum(count) * 100) %>%
  mutate(label = paste0(Pclass, " (", round(percentage, 1), "%)"))

# Palette keyed by class level
custom_colors <- c("1" = "#4D869C", "2" = "#B3E8E5", "3" = "#F2F7A1")

# One stacked bar wrapped onto polar coordinates gives the pie
p_pie_pclass <- ggplot(
  pclass_counts,
  aes(x = "", y = percentage, fill = factor(Pclass))
) +
  geom_col(width = 1) +
  geom_text(aes(label = label), position = position_stack(vjust = 0.5)) +
  coord_polar(theta = "y") +
  scale_fill_manual(values = custom_colors) +
  labs(fill = "Pclass") +
  theme_void() +
  theme(legend.position = "right")
p_pie_pclass
# Sex pie chart ----
# Count passengers per sex, then derive the percentage share and a
# "<sex> (<share>%)" label for the slices.
sex_counts <- train %>%
  group_by(Sex) %>%
  summarise(count = n()) %>%
  mutate(percentage = count / sum(count) * 100) %>%
  mutate(label = paste0(Sex, " (", round(percentage, 1), "%)"))

# Two-colour palette; also picked up by the plots that follow until it is
# reassigned
custom_colors <- c("#4D869C", "#F2F7A1")

# One stacked bar wrapped onto polar coordinates gives the pie
p_pie_sex <- ggplot(
  sex_counts,
  aes(x = "", y = percentage, fill = factor(Sex))
) +
  geom_col(width = 1) +
  geom_text(aes(label = label), position = position_stack(vjust = 0.5)) +
  coord_polar(theta = "y") +
  scale_fill_manual(values = custom_colors) +
  labs(fill = "Sex") +
  theme_void() +
  theme(legend.position = "right")
p_pie_sex
# Passenger counts per SibSp value, side-by-side bars split by survival.
# Uses whatever two-colour `custom_colors` is current at this point.
p_sibsp_bar <- ggplot(train, aes(x = factor(SibSp), fill = Survived)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = custom_colors) +
  labs(
    title = "Bar Plot of SibSp by Survival Status",
    x = "Number of Siblings/Spouses (SibSp)",
    y = "Count"
  ) +
  theme_minimal() +
  theme(legend.position = "right")
p_sibsp_bar
# Fare distribution per passenger class, one box per survival status.
# Uses whatever two-colour `custom_colors` is current at this point.
p_fare_boxplot <- ggplot(
  train,
  aes(x = factor(Pclass), y = Fare, fill = Survived)
) +
  geom_boxplot() +
  scale_fill_manual(values = custom_colors) +
  labs(
    title = "Boxplot of Fare by Class and Survival Status",
    x = "Pclass",
    y = "Fare"
  ) +
  theme_minimal() +
  theme(legend.position = "right")
p_fare_boxplot
# Passenger counts per embarkation value, side-by-side bars split by survival.
# Uses whatever two-colour `custom_colors` is current at this point.
p_embarked_bar <- ggplot(train, aes(x = Embarked, fill = Survived)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = custom_colors) +
  labs(
    title = "Bar Plot of Embarked Locations by Survival Status",
    x = "Embarked",
    y = "Count"
  ) +
  theme_minimal() +
  theme(legend.position = "right")
p_embarked_bar
# Two-colour palette, one colour per Survived level
custom_colors <- c("#4D869C", "#F2F7A1")

# Age frequency polygons (1-year bins), one line per survival status.
# Age contains NAs (see the summary output), which ggplot2 drops with a warning.
p_age <- ggplot(train, aes(x = Age, colour = Survived)) +
  geom_freqpoly(binwidth = 1) +
  scale_colour_manual(values = custom_colors) +
  theme(legend.position = "right")
# Survival share within each sex; position = "fill" scales every bar to 1
p_sex <- ggplot(train, aes(x = Sex, fill = Survived)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = custom_colors) +
  labs(x = "Sex") +
  theme(legend.position = "right")
# Survival share within each passenger class; position = "fill" scales every
# bar to 1.
# Only `fill` is mapped to Survived. Mapping `colour` as well, with no colour
# scale set, outlined the bars in ggplot2's default hues, which clashed with
# the custom fill palette and with the sibling plots.
p_class <- ggplot(train, mapping = aes(x = Pclass, fill = Survived)) +
  geom_bar(stat = "count", position = "fill") +
  labs(x = "Pclass") +
  scale_fill_manual(values = custom_colors) +
  theme(legend.position = "none")
# Survival share per embarkation value; bars scaled to 1, legend hidden
p_emb <- ggplot(train, aes(x = Embarked, fill = Survived)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = custom_colors) +
  labs(x = "Embarked") +
  theme(legend.position = "none")

# Survival share per SibSp value; bars scaled to 1, legend hidden
p_sib <- ggplot(train, aes(x = SibSp, fill = Survived)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = custom_colors) +
  labs(x = "SibSp") +
  theme(legend.position = "none")

# Survival share per Parch value; bars scaled to 1, legend hidden
p_par <- ggplot(train, aes(x = Parch, fill = Survived)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = custom_colors) +
  labs(x = "Parch") +
  theme(legend.position = "none")
# Fare frequency polygons on a log10 x axis, one line per survival status.
# binwidth is measured on the transformed (log10) scale.
# NOTE(review): the summary output shows a minimum Fare of 0; log10(0) is -Inf,
# so those rows are dropped from this plot with a warning.
p_fare <- ggplot(train, aes(x = Fare, colour = Survived)) +
  geom_freqpoly(binwidth = 0.05) +
  scale_x_log10() +
  scale_colour_manual(values = custom_colors) +
  theme(legend.position = "none")
# Render the survival-by-feature plots in their original order.
# The "Fig. 2" captions left over from the rendered report are kept as
# comments: as bare lines they are not valid R and stop the script from parsing.
p_age
# Fig. 2
p_sex
# Fig. 2
p_fare
# Fig. 2
p_class
# Fig. 2
p_emb
# Fig. 2
p_sib
# Fig. 2
p_par
# Fig. 2