library(ggplot2)#  visualizations.
library(psych)# descriptive statistics functions.
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(dplyr)# For data manipulation and wrangling using the pipe operator %>%.
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

1 Load the Titanic Dataset

# Read the passenger records from the CSV in the working directory.
data <- read.csv(file = "Titanic-Dataset.csv")

2 Explore the data

head(data, n = 6L) # Preview the first six rows
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp Parch
## 1                             Braund, Mr. Owen Harris   male  22     1     0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3                              Heikkinen, Miss. Laina female  26     0     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5                            Allen, Mr. William Henry   male  35     0     0
## 6                                    Moran, Mr. James   male  28     0     0
##             Ticket    Fare   Cabin Embarked
## 1        A/5 21171  7.2500 Unknown        S
## 2         PC 17599 71.2833     C85        C
## 3 STON/O2. 3101282  7.9250 Unknown        S
## 4           113803 53.1000    C123        S
## 5           373450  8.0500 Unknown        S
## 6           330877  8.4583 Unknown        Q
psych::describe(data) # Per-column descriptive statistics (n, mean, sd, median, skew, kurtosis, se, ...)
##             vars   n   mean     sd median trimmed    mad  min    max  range
## PassengerId    1 891 446.00 257.35 446.00  446.00 330.62 1.00 891.00 890.00
## Survived       2 891   0.38   0.49   0.00    0.35   0.00 0.00   1.00   1.00
## Pclass         3 891   2.31   0.84   3.00    2.39   0.00 1.00   3.00   2.00
## Name*          4 891 446.00 257.35 446.00  446.00 330.62 1.00 891.00 890.00
## Sex*           5 891   1.65   0.48   2.00    1.68   0.00 1.00   2.00   1.00
## Age            6 891  29.36  13.02  28.00   28.83   8.90 0.42  80.00  79.58
## SibSp          7 891   0.52   1.10   0.00    0.27   0.00 0.00   8.00   8.00
## Parch          8 891   0.38   0.81   0.00    0.18   0.00 0.00   6.00   6.00
## Ticket*        9 891 339.52 200.83 338.00  339.65 268.35 1.00 681.00 680.00
## Fare          10 891  32.20  49.69  14.45   21.38  10.24 0.00 512.33 512.33
## Cabin*        11 891 131.74  36.02 148.00  141.65   0.00 1.00 148.00 147.00
## Embarked*     12 891   2.54   0.79   3.00    2.67   0.00 1.00   3.00   2.00
##              skew kurtosis   se
## PassengerId  0.00    -1.20 8.62
## Survived     0.48    -1.77 0.02
## Pclass      -0.63    -1.28 0.03
## Name*        0.00    -1.20 8.62
## Sex*        -0.62    -1.62 0.02
## Age          0.51     0.97 0.44
## SibSp        3.68    17.73 0.04
## Parch        2.74     9.69 0.03
## Ticket*      0.00    -1.28 6.73
## Fare         4.77    33.12 1.66
## Cabin*      -2.16     3.38 1.21
## Embarked*   -1.26    -0.22 0.03

3 Structure of table

utils::str(data) # Column names, types and the first few values of each column
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 28 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "Unknown" "C85" "Unknown" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...

4 Summary of table

summary(object = data) # Five-number summary plus mean for numeric columns; length/class for character columns
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:22.00   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.36   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:35.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33

5 Dimension of table: checks how many rows and columns the dataset has

c(nrow(data), ncol(data)) # Number of rows, then number of columns
## [1] 891  12

6 Checking missing values

data %>% is.na() %>% colSums() # Number of NA values in each column
## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0           0 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0           0           0
any(is.na(data)) # TRUE if at least one NA exists anywhere in the table
## [1] FALSE

7 How many passengers survived vs. died?

table(data[["Survived"]]) # Passenger counts: 0 = died, 1 = survived
## 
##   0   1 
## 549 342

# Survival proportions by gender and class (using prop.table())

# Row-wise proportions (margin = 1): each sex's row sums to 1, so column "1"
# is that sex's survival rate.
prop.table(table(data[["Sex"]], data[["Survived"]]), margin = 1)
##         
##                  0         1
##   female 0.2579618 0.7420382
##   male   0.8110919 0.1889081
# Row-wise proportions (margin = 1): column "1" is the survival rate per class.
prop.table(table(data[["Pclass"]], data[["Survived"]]), margin = 1)
##    
##             0         1
##   1 0.3703704 0.6296296
##   2 0.5271739 0.4728261
##   3 0.7576375 0.2423625

8 Distribution of fare

mean(data[["Fare"]], na.rm = TRUE) # Mean fare, ignoring missing values
## [1] 32.20421
median(data[["Fare"]], na.rm = TRUE) # Median fare, ignoring missing values
## [1] 14.4542
range(data[["Fare"]], na.rm = TRUE) # Smallest and largest fare
## [1]   0.0000 512.3292

9 Arrange: Sort passengers by Age (descending)

# Six oldest passengers with their survival outcome and class.
data %>%
  arrange(desc(Age)) %>%
  select(Name, Age, Survived, Pclass) %>%
  head()
##                                   Name  Age Survived Pclass
## 1 Barkworth, Mr. Algernon Henry Wilson 80.0        1      1
## 2                  Svensson, Mr. Johan 74.0        0      3
## 3            Goldschmidt, Mr. George B 71.0        0      1
## 4              Artagaveytia, Mr. Ramon 71.0        0      1
## 5                 Connors, Mr. Patrick 70.5        0      3
## 6          Mitchell, Mr. Henry Michael 70.0        0      2

10 Aggregate analysis Average Age by Passenger Class

# Mean age within each passenger class.
aggregate(Age ~ Pclass, data = data, FUN = function(ages) mean(ages, na.rm = TRUE))
##   Pclass      Age
## 1      1 36.81213
## 2      2 29.76538
## 3      3 25.93263

11 Group by Summarize: Group by Passenger Class

# Per-class averages of age and fare, plus the survival rate.
# Survived is coded 0/1 (see the str() output), so its mean IS the survival
# rate. The former `as.numeric(Survived) - 1` shifted every value by -1 and
# reported negative rates; going through as.character() gives the right
# answer whether Survived is an integer or a factor with levels "0"/"1".
data %>%
  group_by(Pclass) %>%
  summarise(
    avg_Age       = mean(Age, na.rm = TRUE),
    avg_Fare      = mean(Fare, na.rm = TRUE),
    survival_rate = mean(as.numeric(as.character(Survived)), na.rm = TRUE)
  )
## # A tibble: 3 × 4
##   Pclass avg_Age avg_Fare survival_rate
##    <int>   <dbl>    <dbl>         <dbl>
## 1      1    36.8     84.2         0.630
## 2      2    29.8     20.7         0.473
## 3      3    25.9     13.7         0.242

12 Find Class with Highest Survival Rate

# Class with the highest survival rate.
# Survived is coded 0/1, so its mean is the rate itself; the previous
# `as.numeric(Survived) - 1` reported the rate minus one (a negative number).
data %>%
  group_by(Pclass) %>%
  summarise(
    survival_rate = mean(as.numeric(as.character(Survived)), na.rm = TRUE)
  ) %>%
  arrange(desc(survival_rate)) %>%
  slice(1)
## # A tibble: 1 × 2
##   Pclass survival_rate
##    <int>         <dbl>
## 1      1         0.630

13 Categorize Age into Child, Adult, Senior

# Bucket passengers by age: under 18 = Child, 60 and over = Senior, else Adult.
data <- data %>%
  mutate(
    age_category = case_when(
      is.na(Age) ~ NA_character_, # keep missing ages missing, as ifelse() does
      Age >= 60  ~ "Senior",
      Age < 18   ~ "Child",
      TRUE       ~ "Adult"
    )
  )

# Spot-check the new column on the first rows.
data %>%
  select(Name, Age, age_category) %>%
  head()
##                                                  Name Age age_category
## 1                             Braund, Mr. Owen Harris  22        Adult
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer)  38        Adult
## 3                              Heikkinen, Miss. Laina  26        Adult
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel)  35        Adult
## 5                            Allen, Mr. William Henry  35        Adult
## 6                                    Moran, Mr. James  28        Adult

14 Group by Sex + Summarise

# Mean age and survival rate for each sex.
# Survived is coded 0/1, so its mean is the survival rate; the previous
# `as.numeric(Survived) - 1` produced negative rates.
data %>%
  group_by(Sex) %>%
  summarise(
    avg_Age       = mean(Age, na.rm = TRUE),
    survival_rate = mean(as.numeric(as.character(Survived)), na.rm = TRUE)
  )
## # A tibble: 2 × 3
##   Sex    avg_Age survival_rate
##   <chr>    <dbl>         <dbl>
## 1 female    27.9         0.742
## 2 male      30.1         0.189

15 Correlation Between Fare and Class: Check correlation and visualize.

# Numeric columns used for the correlation analysis
numeric_data <- data[c("Age", "Fare", "Pclass", "SibSp", "Parch")]

# Pairwise correlations, computed only on rows with no missing values
cor_matrix <- cor(x = numeric_data, use = "complete.obs")

# Show the matrix
print(cor_matrix)
##                Age        Fare      Pclass       SibSp       Parch
## Age     1.00000000  0.09668842 -0.33989833 -0.23329633 -0.17248195
## Fare    0.09668842  1.00000000 -0.54949962  0.15965104  0.21622494
## Pclass -0.33989833 -0.54949962  1.00000000  0.08308136  0.01844267
## SibSp  -0.23329633  0.15965104  0.08308136  1.00000000  0.41483770
## Parch  -0.17248195  0.21622494  0.01844267  0.41483770  1.00000000

15.1 Visualization of correlation

library(reshape2)

# melt() turns the matrix into long form: one row per (Var1, Var2) pair with
# the correlation in `value`.
# Correlations are signed, so use a diverging scale centred on 0 with fixed
# limits of [-1, 1]. A single light-to-dark gradient made strong negative
# correlations look like "no relationship".
ggplot(melt(cor_matrix), aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  geom_text(aes(label = round(value, 2)), color = "black") +
  scale_fill_gradient2(
    low = "firebrick", mid = "white", high = "steelblue",
    midpoint = 0, limits = c(-1, 1)
  ) +
  labs(title = "Correlation Heatmap of Titanic Variables")

16 grouped bar chart Visualization: show how Survival varied across Passenger Class

# Side-by-side bars: passenger counts per class, split by survival outcome.
ggplot(
  data = data,
  mapping = aes(x = factor(Pclass), fill = factor(Survived))
) +
  geom_bar(position = "dodge") +
  labs(
    title = "Survival by Passenger Class",
    x = "Class",
    y = "Count",
    fill = "Survived"
  )

17 grouped bar chart Visualization: Highlights the difference between male and female survival rates

# Side-by-side bars: passenger counts per sex, split by survival outcome.
ggplot(
  data = data,
  mapping = aes(x = Sex, fill = factor(Survived))
) +
  geom_bar(position = "dodge") +
  labs(
    title = "Survival by Gender",
    x = "Gender",
    y = "Count",
    fill = "Survived"
  )

18 Boxplot: Shows age distribution among survivors and non‑survivors

# Base-graphics boxplot: one box of ages per survival outcome (0 then 1).
boxplot(
  Age ~ Survived,
  data = data,
  col  = c("lightcoral", "lightgreen"),
  main = "Age Distribution by Survival",
  xlab = "Survived (0=No, 1=Yes)",
  ylab = "Age"
)

19 Stacked Bar Chart: Survival by Class and Gender

# Stacked bars per class, one panel per sex.
ggplot(
  data = data,
  mapping = aes(x = factor(Pclass), fill = factor(Survived))
) +
  geom_bar(position = "stack") +
  facet_wrap(~ Sex) +
  labs(
    title = "Stacked Survival by Class and Gender",
    x = "Class",
    y = "Count",
    fill = "Survived"
  )

20 Line plot (Average Age by Passenger Class): Displays how average age changes across classes.

# One row per class holding the mean age of its passengers.
avg_age <- aggregate(
  Age ~ Pclass,
  data = data,
  FUN = function(ages) mean(ages, na.rm = TRUE)
)

# group = 1 joins all classes into a single line.
ggplot(data = avg_age, mapping = aes(x = Pclass, y = Age)) +
  geom_line(color = "darkred", group = 1) +
  geom_point(color = "black", size = 3) +
  labs(
    title = "Average Age by Passenger Class",
    x = "Class",
    y = "Average Age"
  )

21 Scatter plot (Age vs Fare): Shows the relationship between age, fare, and survival status

# One semi-transparent point per passenger, coloured by survival outcome.
ggplot(
  data = data,
  mapping = aes(x = Age, y = Fare, color = factor(Survived))
) +
  geom_point(alpha = 0.6) +
  labs(
    title = "Age vs Fare Colored by Survival",
    x = "Age",
    y = "Fare",
    color = "Survived"
  )

22 Histogram (Age distribution of passengers): Displays how passenger ages were spread out

# Age histogram with 5-year-wide bins.
ggplot(data = data, mapping = aes(x = Age)) +
  geom_histogram(color = "black", fill = "skyblue", binwidth = 5) +
  labs(
    title = "Age Distribution of Titanic Passengers",
    x = "Age",
    y = "Frequency"
  )

23 Heatmap: Correlation among numeric variables.

library(reshape2)
# Correlations among Age, Fare and Pclass on complete rows only.
corr <- cor(data[c("Age", "Fare", "Pclass")], use = "complete.obs")
# melt() gives one row per variable pair; each tile is filled by its correlation.
ggplot(data = melt(corr), mapping = aes(x = Var1, y = Var2, fill = value)) +
  geom_tile()

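24 Logistic regression model: Logistic regression shows that sex and class are strong predictors; age has a smaller but still statistically significant negative effect, while fare is not significant once class is in the model. Coefficients are on the log-odds scale; exponentiating them gives odds ratios for survival.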

# Logistic regression of survival (0/1) on class, sex, age and fare.
# family = "binomial" uses the logit link, so the estimates reported by
# summary() are on the log-odds scale.
# The call is left as written because summary() echoes it verbatim.
model <- glm(Survived ~ Pclass + Sex + Age + Fare,
             data=data, family="binomial")
summary(model)
## 
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + Fare, family = "binomial", 
##     data = data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  4.6553374  0.5085945   9.153  < 2e-16 ***
## Pclass      -1.1529180  0.1355637  -8.505  < 2e-16 ***
## Sexmale     -2.6072959  0.1872514 -13.924  < 2e-16 ***
## Age         -0.0331244  0.0073991  -4.477 7.58e-06 ***
## Fare         0.0005922  0.0020347   0.291    0.771    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1186.7  on 890  degrees of freedom
## Residual deviance:  805.5  on 886  degrees of freedom
## AIC: 815.5
## 
## Number of Fisher Scoring iterations: 5

25 Visualization of logistic regression model

# Fitted survival probability for every passenger (response scale, 0-1)
data[["PredictedProb"]] <- predict(model, type = "response")

# Predicted probability against age, with one loess trend line per sex
ggplot(
  data = data,
  mapping = aes(x = Age, y = PredictedProb, color = Sex)
) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "loess", se = FALSE) +
  labs(
    title = "Logistic Regression: Survival Probability by Age",
    x = "Age",
    y = "Predicted Probability of Survival"
  )
## `geom_smooth()` using formula = 'y ~ x'

26 Linear Regression: Regression line shows weak correlation — age does not strongly predict fare. This confirms fare was more tied to class than age.

# Simple linear regression: fare explained by age alone.
# The call is left as written because summary() echoes it verbatim.
lm_simple <- lm(Fare ~ Age, data=data)
summary(lm_simple)
## 
## Call:
## lm(formula = Fare ~ Age, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -40.90 -23.53 -16.88   1.46 478.04 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  21.3686     4.0919   5.222  2.2e-07 ***
## Age           0.3690     0.1274   2.896  0.00387 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 49.49 on 889 degrees of freedom
## Multiple R-squared:  0.009349,   Adjusted R-squared:  0.008234 
## F-statistic: 8.389 on 1 and 889 DF,  p-value: 0.003867
# Scatter of fare against age with the least-squares line overlaid
ggplot(data = data, mapping = aes(x = Age, y = Fare)) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  labs(
    title = "Linear Regression: Fare vs Age",
    x = "Age",
    y = "Fare"
  )
## `geom_smooth()` using formula = 'y ~ x'

27 Multiple linear Regression: Adding predictors (age, class, family size) improves prediction. The actual vs. predicted plot shows model fit.

# Multiple linear regression: fare explained by age, class and the two
# family-size counts (SibSp, Parch).
# The call is left as written because summary() echoes it verbatim.
lm_multi <- lm(Fare ~ Age + Pclass + SibSp + Parch, data=data)
summary(lm_multi)
## 
## Call:
## lm(formula = Fare ~ Age + Pclass + SibSp + Parch, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -70.35 -20.53   3.92   6.53 443.07 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 108.9972     6.1697  17.667  < 2e-16 ***
## Age          -0.1552     0.1115  -1.393    0.164    
## Pclass      -34.3020     1.6876 -20.326  < 2e-16 ***
## SibSp         5.8190     1.3433   4.332 1.65e-05 ***
## Parch        10.2516     1.8159   5.645 2.22e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39.54 on 886 degrees of freedom
## Multiple R-squared:  0.3698, Adjusted R-squared:  0.3669 
## F-statistic:   130 on 4 and 886 DF,  p-value: < 2.2e-16
# Fitted fare for every passenger from the multiple regression
data[["PredictedFare"]] <- predict(lm_multi, newdata = data)

# Actual vs predicted fare; the red y = x line marks a perfect prediction
ggplot(data = data, mapping = aes(x = Fare, y = PredictedFare)) +
  geom_point(color = "darkgreen", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  labs(
    title = "Multiple Linear Regression: Actual vs Predicted Fare",
    x = "Actual Fare",
    y = "Predicted Fare"
  )

28 Analysis

# Survival rate by Gender and Class
# Survived is coded 0/1, so its mean within each group is the survival rate.
# .groups = "drop" returns an ungrouped tibble: later verbs on `analysis` are
# not silently applied per Sex, and the regrouping message is not emitted.
analysis <- data %>%
  group_by(Sex, Pclass) %>%
  summarise(
    survival_rate = mean(Survived, na.rm = TRUE),
    .groups = "drop"
  )
analysis
## # A tibble: 6 × 3
##   Sex    Pclass survival_rate
##   <chr>   <int>         <dbl>
## 1 female      1         0.968
## 2 female      2         0.921
## 3 female      3         0.5  
## 4 male        1         0.369
## 5 male        2         0.157
## 6 male        3         0.135

29 Model cleaning (Logistic Regression)

# Data cleaning before refitting the model:
#   - Survived as numeric 0/1 (via character, so a factor would convert by label)
#   - Sex as a factor
#   - missing ages replaced by the mean of the observed ages
data <- data %>%
  mutate(
    Survived = as.numeric(as.character(Survived)),
    Sex      = as.factor(Sex),
    Age      = replace(Age, is.na(Age), mean(Age, na.rm = TRUE))
  )

30 Model Training

# Refit the logistic regression on the cleaned data (same formula as the
# earlier fit). This overwrites `model`, which the prediction step below uses.
# The call is left as written because summary() echoes it verbatim.
model <- glm(Survived ~ Pclass + Sex + Age + Fare,
             data = data,
             family = "binomial")

summary(model)
## 
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + Fare, family = "binomial", 
##     data = data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  4.6553374  0.5085945   9.153  < 2e-16 ***
## Pclass      -1.1529180  0.1355637  -8.505  < 2e-16 ***
## Sexmale     -2.6072959  0.1872514 -13.924  < 2e-16 ***
## Age         -0.0331244  0.0073991  -4.477 7.58e-06 ***
## Fare         0.0005922  0.0020347   0.291    0.771    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1186.7  on 890  degrees of freedom
## Residual deviance:  805.5  on 886  degrees of freedom
## AIC: 815.5
## 
## Number of Fisher Scoring iterations: 5

31 Predict for a NEW passenger

# A hypothetical passenger: one row with exactly the predictors the model uses
new_passenger <- data.frame(
  Pclass = 1,
  Sex    = "female",
  Age    = 25,
  Fare   = 100
)

# type = "response" returns a probability rather than log-odds
prob <- predict(object = model, newdata = new_passenger, type = "response")

prob
##         1 
## 0.9389764
# Turn the probability into a label using a 0.5 cut-off
ifelse(test = prob > 0.5, yes = "Survived", no = "Not Survived")
##          1 
## "Survived"