##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp Parch
## 1 Braund, Mr. Owen Harris male 22 1 0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## 3 Heikkinen, Miss. Laina female 26 0 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
## 5 Allen, Mr. William Henry male 35 0 0
## 6 Moran, Mr. James male 28 0 0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.2500 Unknown S
## 2 PC 17599 71.2833 C85 C
## 3 STON/O2. 3101282 7.9250 Unknown S
## 4 113803 53.1000 C123 S
## 5 373450 8.0500 Unknown S
## 6 330877 8.4583 Unknown Q
## vars n mean sd median trimmed mad min max range
## PassengerId 1 891 446.00 257.35 446.00 446.00 330.62 1.00 891.00 890.00
## Survived 2 891 0.38 0.49 0.00 0.35 0.00 0.00 1.00 1.00
## Pclass 3 891 2.31 0.84 3.00 2.39 0.00 1.00 3.00 2.00
## Name* 4 891 446.00 257.35 446.00 446.00 330.62 1.00 891.00 890.00
## Sex* 5 891 1.65 0.48 2.00 1.68 0.00 1.00 2.00 1.00
## Age 6 891 29.36 13.02 28.00 28.83 8.90 0.42 80.00 79.58
## SibSp 7 891 0.52 1.10 0.00 0.27 0.00 0.00 8.00 8.00
## Parch 8 891 0.38 0.81 0.00 0.18 0.00 0.00 6.00 6.00
## Ticket* 9 891 339.52 200.83 338.00 339.65 268.35 1.00 681.00 680.00
## Fare 10 891 32.20 49.69 14.45 21.38 10.24 0.00 512.33 512.33
## Cabin* 11 891 131.74 36.02 148.00 141.65 0.00 1.00 148.00 147.00
## Embarked* 12 891 2.54 0.79 3.00 2.67 0.00 1.00 3.00 2.00
## skew kurtosis se
## PassengerId 0.00 -1.20 8.62
## Survived 0.48 -1.77 0.02
## Pclass -0.63 -1.28 0.03
## Name* 0.00 -1.20 8.62
## Sex* -0.62 -1.62 0.02
## Age 0.51 0.97 0.44
## SibSp 3.68 17.73 0.04
## Parch 2.74 9.69 0.03
## Ticket* 0.00 -1.28 6.73
## Fare 4.77 33.12 1.66
## Cabin* -2.16 3.38 1.21
## Embarked* -1.26 -0.22 0.03
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 28 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "Unknown" "C85" "Unknown" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:22.00 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.36 Mean :0.523 Mean :0.3816
## 3rd Qu.:35.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
## [1] 891 12
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 0
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 0 0
## [1] FALSE
##
## 0 1
## 549 342
# Survival proportions by gender and class (using prop.table())
##
## 0 1
## female 0.2579618 0.7420382
## male 0.8110919 0.1889081
##
## 0 1
## 1 0.3703704 0.6296296
## 2 0.5271739 0.4728261
## 3 0.7576375 0.2423625
## [1] 32.20421
## [1] 14.4542
## [1] 0.0000 512.3292
## Name Age Survived Pclass
## 1 Barkworth, Mr. Algernon Henry Wilson 80.0 1 1
## 2 Svensson, Mr. Johan 74.0 0 3
## 3 Goldschmidt, Mr. George B 71.0 0 1
## 4 Artagaveytia, Mr. Ramon 71.0 0 1
## 5 Connors, Mr. Patrick 70.5 0 3
## 6 Mitchell, Mr. Henry Michael 70.0 0 2
## Pclass Age
## 1 1 36.81213
## 2 2 29.76538
## 3 3 25.93263
# Per-class summary: mean age, mean fare and survival rate.
# Survived is stored as integer 0/1, so its mean is already the share of
# survivors. The previous `as.numeric(Survived) - 1` shifted the values to
# -1/0 and produced negative "rates".
data %>%
  group_by(Pclass) %>%
  summarise(
    avg_Age = mean(Age, na.rm = TRUE),
    avg_Fare = mean(Fare, na.rm = TRUE),
    survival_rate = mean(Survived, na.rm = TRUE)
  )
## # A tibble: 3 × 4
## Pclass avg_Age avg_Fare survival_rate
## <int> <dbl> <dbl> <dbl>
## 1 1 36.8 84.2 0.630
## 2 2 29.8 20.7 0.473
## 3 3 25.9 13.7 0.242
# Passenger class with the highest survival rate.
# Survived is integer 0/1, so mean(Survived) is the survival proportion; the
# earlier `- 1` offset reported the rate as a negative number.
data %>%
  group_by(Pclass) %>%
  summarise(survival_rate = mean(Survived, na.rm = TRUE)) %>%
  arrange(desc(survival_rate)) %>%
  slice(1)
## # A tibble: 1 × 2
## Pclass survival_rate
## <int> <dbl>
## 1 1 0.630
# Label each passenger with an age band:
#   under 18 -> "Child", 60 and over -> "Senior", otherwise "Adult".
# The two conditions cannot both hold, so the order of the tests is irrelevant.
data <- mutate(
  data,
  age_category = ifelse(
    Age < 18,
    "Child",
    ifelse(Age >= 60, "Senior", "Adult")
  )
)

# Preview the new column beside the fields it relates to
preview_cols <- c("Name", "Age", "age_category")
head(data[, preview_cols])
## Name Age age_category
## 1 Braund, Mr. Owen Harris 22 Adult
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) 38 Adult
## 3 Heikkinen, Miss. Laina 26 Adult
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 35 Adult
## 5 Allen, Mr. William Henry 35 Adult
## 6 Moran, Mr. James 28 Adult
# Mean age and survival rate for each sex.
# Survived is integer 0/1, so its mean is the proportion that survived; the
# former `as.numeric(Survived) - 1` turned the rates negative.
data %>%
  group_by(Sex) %>%
  summarise(
    avg_Age = mean(Age, na.rm = TRUE),
    survival_rate = mean(Survived, na.rm = TRUE)
  )
## # A tibble: 2 × 3
## Sex avg_Age survival_rate
## <chr> <dbl> <dbl>
## 1 female 27.9 0.742
## 2 male 30.1 0.189
# Keep the numeric passenger attributes used for the correlation analysis
numeric_cols <- c("Age", "Fare", "Pclass", "SibSp", "Parch")
numeric_data <- data[numeric_cols]

# Pairwise correlations, computed on rows with no missing values
cor_matrix <- cor(numeric_data, use = "complete.obs")

# Show the matrix
print(cor_matrix)
## Age Fare Pclass SibSp Parch
## Age 1.00000000 0.09668842 -0.33989833 -0.23329633 -0.17248195
## Fare 0.09668842 1.00000000 -0.54949962 0.15965104 0.21622494
## Pclass -0.33989833 -0.54949962 1.00000000 0.08308136 0.01844267
## SibSp -0.23329633 0.15965104 0.08308136 1.00000000 0.41483770
## Parch -0.17248195 0.21622494 0.01844267 0.41483770 1.00000000
# Exploratory plots ----
# Each plot is its own top-level statement. They were previously run together
# on shared lines (e.g. `labs(...)ggplot(...)`), which R cannot parse.

# melt() turns a matrix into the long Var1 / Var2 / value columns that the
# heatmaps below map to x, y and fill. Loaded once for both heatmaps.
library(reshape2)

# Correlation heatmap, annotated with coefficients rounded to 2 decimals
ggplot(melt(cor_matrix), aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(value, 2)), color = "black") +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(title = "Correlation Heatmap of Titanic Variables")

# Survivor / non-survivor counts side by side for each passenger class
ggplot(data, aes(x = factor(Pclass), fill = factor(Survived))) +
  geom_bar(position = "dodge") +
  labs(
    title = "Survival by Passenger Class",
    x = "Class", y = "Count", fill = "Survived"
  )

# Survivor / non-survivor counts side by side for each sex
ggplot(data, aes(x = Sex, fill = factor(Survived))) +
  geom_bar(position = "dodge") +
  labs(
    title = "Survival by Gender",
    x = "Gender", y = "Count", fill = "Survived"
  )

# Age distribution split by survival outcome (base graphics)
boxplot(
  Age ~ Survived,
  data = data,
  main = "Age Distribution by Survival",
  xlab = "Survived (0=No, 1=Yes)", ylab = "Age",
  col = c("lightcoral", "lightgreen")
)

# Stacked survival counts per class, one panel per sex
ggplot(data, aes(x = factor(Pclass), fill = factor(Survived))) +
  geom_bar(position = "stack") +
  facet_wrap(~Sex) +
  labs(
    title = "Stacked Survival by Class and Gender",
    x = "Class", y = "Count", fill = "Survived"
  )

# Mean age per class, drawn as a connected line with point markers
avg_age <- aggregate(Age ~ Pclass, data = data, FUN = mean, na.rm = TRUE)
ggplot(avg_age, aes(x = Pclass, y = Age)) +
  geom_line(group = 1, color = "darkred") +
  geom_point(size = 3, color = "black") +
  labs(
    title = "Average Age by Passenger Class",
    x = "Class", y = "Average Age"
  )

# Age against fare, coloured by survival outcome
ggplot(data, aes(x = Age, y = Fare, color = factor(Survived))) +
  geom_point(alpha = 0.6) +
  labs(
    title = "Age vs Fare Colored by Survival",
    x = "Age", y = "Fare", color = "Survived"
  )

# Histogram of passenger ages in 5-year bins
ggplot(data, aes(x = Age)) +
  geom_histogram(binwidth = 5, fill = "skyblue", color = "black") +
  labs(
    title = "Age Distribution of Titanic Passengers",
    x = "Age",
    y = "Frequency"
  )

# Smaller heatmap restricted to Age, Fare and Pclass
corr <- cor(data[, c("Age", "Fare", "Pclass")], use = "complete.obs")
ggplot(melt(corr), aes(Var1, Var2, fill = value)) +
  geom_tile()
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + Fare, family = "binomial",
## data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.6553374 0.5085945 9.153 < 2e-16 ***
## Pclass -1.1529180 0.1355637 -8.505 < 2e-16 ***
## Sexmale -2.6072959 0.1872514 -13.924 < 2e-16 ***
## Age -0.0331244 0.0073991 -4.477 7.58e-06 ***
## Fare 0.0005922 0.0020347 0.291 0.771
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.7 on 890 degrees of freedom
## Residual deviance: 805.5 on 886 degrees of freedom
## AIC: 815.5
##
## Number of Fisher Scoring iterations: 5
# Store the logistic model's fitted values on the response scale
# (type = "response") as a new column
data[["PredictedProb"]] <- predict(model, type = "response")

# Fitted survival probability against age, one colour and one loess
# trend line per sex
ggplot(data, aes(Age, PredictedProb, colour = Sex)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle("Logistic Regression: Survival Probability by Age") +
  xlab("Age") +
  ylab("Predicted Probability of Survival")
## `geom_smooth()` using formula = 'y ~ x'
##
## Call:
## lm(formula = Fare ~ Age, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.90 -23.53 -16.88 1.46 478.04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.3686 4.0919 5.222 2.2e-07 ***
## Age 0.3690 0.1274 2.896 0.00387 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 49.49 on 889 degrees of freedom
## Multiple R-squared: 0.009349, Adjusted R-squared: 0.008234
## F-statistic: 8.389 on 1 and 889 DF, p-value: 0.003867
# Scatter of fare against age with a straight-line (method = "lm") fit overlaid
ggplot(data, aes(Age, Fare)) +
  geom_point(colour = "blue", alpha = 0.6) +
  geom_smooth(method = "lm", colour = "red", se = FALSE) +
  ggtitle("Linear Regression: Fare vs Age") +
  xlab("Age") +
  ylab("Fare")
## `geom_smooth()` using formula = 'y ~ x'
# Linear model of fare on age, passenger class, SibSp and Parch.
# The formula is written inline so the "Call:" echoed by summary() is unchanged.
lm_multi <- lm(
  formula = Fare ~ Age + Pclass + SibSp + Parch,
  data = data
)
summary(lm_multi)
##
## Call:
## lm(formula = Fare ~ Age + Pclass + SibSp + Parch, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70.35 -20.53 3.92 6.53 443.07
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 108.9972 6.1697 17.667 < 2e-16 ***
## Age -0.1552 0.1115 -1.393 0.164
## Pclass -34.3020 1.6876 -20.326 < 2e-16 ***
## SibSp 5.8190 1.3433 4.332 1.65e-05 ***
## Parch 10.2516 1.8159 5.645 2.22e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.54 on 886 degrees of freedom
## Multiple R-squared: 0.3698, Adjusted R-squared: 0.3669
## F-statistic: 130 on 4 and 886 DF, p-value: < 2.2e-16
# Compare the multiple-regression predictions with the observed fares.
# Points on the red y = x reference line are predicted exactly.
data[["PredictedFare"]] <- predict(lm_multi, newdata = data)
ggplot(data, aes(Fare, PredictedFare)) +
  geom_point(colour = "darkgreen", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, colour = "red") +
  ggtitle("Multiple Linear Regression: Actual vs Predicted Fare") +
  xlab("Actual Fare") +
  ylab("Predicted Fare")
# Survival rate by Gender and Class
# Survival rate for every Sex x Pclass combination. Survived is 0/1, so its
# mean is the share of survivors in the group.
# .groups = "drop_last" states the grouping of the result explicitly: it stays
# grouped by Sex, exactly as before, but summarise() no longer emits its
# regrouping message.
analysis <- data %>%
  group_by(Sex, Pclass) %>%
  summarise(
    survival_rate = mean(Survived, na.rm = TRUE),
    .groups = "drop_last"
  )
## # A tibble: 6 × 3
## # Groups: Sex [2]
## Sex Pclass survival_rate
## <chr> <int> <dbl>
## 1 female 1 0.968
## 2 female 2 0.921
## 3 female 3 0.5
## 4 male 1 0.369
## 5 male 2 0.157
## 6 male 3 0.135
# Fit a logistic regression (binomial family) of survival on class, sex,
# age and fare. Arguments are kept as literals so the "Call:" printed by
# summary() is unchanged.
model <- glm(
  formula = Survived ~ Pclass + Sex + Age + Fare,
  family = "binomial",
  data = data
)
summary(model)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + Fare, family = "binomial",
## data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.6553374 0.5085945 9.153 < 2e-16 ***
## Pclass -1.1529180 0.1355637 -8.505 < 2e-16 ***
## Sexmale -2.6072959 0.1872514 -13.924 < 2e-16 ***
## Age -0.0331244 0.0073991 -4.477 7.58e-06 ***
## Fare 0.0005922 0.0020347 0.291 0.771
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.7 on 890 degrees of freedom
## Residual deviance: 805.5 on 886 degrees of freedom
## AIC: 815.5
##
## Number of Fisher Scoring iterations: 5