## Seed 2066 set for reproducibility
## Training set size: 712
## Testing set size: 179
Show Initial Missing Values.
##
## Working Set Size: 712
## Note the equivalent split size at initial load time in Step 1.
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 140
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 557 2
## ####################################################################################
# Derive engineered features from the raw passenger columns of `df`.
# Every step first announces itself with cat() so the rendered log shows
# which feature is being generated.
cat("\nFeature Generation: ", "Title", "\n")
##
## Feature Generation: Title
# Title: remove everything up to the comma and everything from the first
# period onward, leaving the honorific embedded in Name, stored as a factor.
df[["Title"]] <- factor(gsub("^.*,\\s*|\\..*$", "", df[["Name"]]))
unique(df[["Title"]])
## [1] Miss Mr Mrs Master Mme
## [6] Dr Col Rev Mlle the Countess
## [11] Lady Jonkheer Sir Major Capt
## [16] Don Ms
## 17 Levels: Capt Col Don Dr Jonkheer Lady Major Master Miss Mlle Mme Mr ... the Countess
cat("\nFeature Generation: ", "FamilySize", "\n")
##
## Feature Generation: FamilySize
# FamilySize: the passenger plus siblings/spouses plus parents/children.
df[["FamilySize"]] <- with(df, SibSp + Parch + 1)
cat("\nFeature Generation: ", "IsAlone", "\n")
##
## Feature Generation: IsAlone
# IsAlone: "YES" when the passenger travels with no family, otherwise "NO".
df[["IsAlone"]] <- factor(ifelse(df[["FamilySize"]] == 1, "YES", "NO"))
cat("\nFeature Generation: ", "AgeGroup", "\n")
##
## Feature Generation: AgeGroup
# AgeGroup: right-closed bins (-Inf,12], (12,18], (18,60], (60,Inf].
# A missing Age yields a missing AgeGroup.
df[["AgeGroup"]] <- cut(
  df[["Age"]],
  breaks = c(-Inf, 12, 18, 60, Inf),
  labels = c("Child", "Teenager", "Adult", "Senior")
)
cat("\nFeature Generation: ", "FarePP", "\n")
##
## Feature Generation: FarePP
# FarePP: fare per person, i.e. the ticket fare spread over the family size.
df[["FarePP"]] <- with(df, Fare / FamilySize)
cat("\nFeature Generation: ", "Fare_Age", "\n")
##
## Feature Generation: Fare_Age
# Fare_Age: interaction term between fare and age.
# NOTE(review): Age still contains missing values at this point, so Fare_Age
# is NA for those rows; the later imputation appears to fill Fare_Age
# independently of the imputed Age - confirm that this is intended.
df[["Fare_Age"]] <- with(df, Fare * Age)
Perform Pre-processing, Imputation on TRAINING Set First. Show Filtered/Cleaned Data.
## Survived Pclass Name Sex Age SibSp Parch
## 0 0 0 0 0 0 0
## Fare Embarked Title FamilySize IsAlone AgeGroup FarePP
## 0 0 0 0 0 0 0
## Fare_Age
## 0
## ####################################################################################
## Survived Pclass Name Sex Age
## 717 YES First Endres, Miss. Caroline Louise female 38.00000
## 602 NO Third Slabenoff, Mr. Petco male 29.28977
## 513 YES First McGough, Mr. James Robert male 36.00000
## 195 YES First Brown, Mrs. James Joseph (Margaret Tobin) female 44.00000
## 230 NO Third Lefebre, Miss. Mathilde female 29.28977
## 71 NO Second Jenkin, Mr. Stephen Curnow male 32.00000
## SibSp Parch Fare Embarked Title FamilySize IsAlone AgeGroup FarePP
## 717 0 0 227.5250 C Miss 1 YES Adult 227.52500
## 602 0 0 7.8958 S Mr 1 YES Adult 7.89580
## 513 0 0 26.2875 S Mr 1 YES Adult 26.28750
## 195 0 0 27.7208 C Mrs 1 YES Adult 27.72080
## 230 3 1 25.4667 S Miss 5 NO Adult 5.09334
## 71 0 0 10.5000 S Mr 1 YES Adult 10.50000
## Fare_Age
## 717 8645.950
## 602 1063.993
## 513 946.350
## 195 1219.715
## 230 1063.993
## 71 336.000
## ####################################################################################
##
## Take a Peek at Categorical Features
## ####################################################################################
## #################################################################
## Check Stats & Distributions of Data Set
## #################################################################
##
## Assessment Statistics for Data Set
## Var Obs Mean Median Variance St.Dev Range IQR Skewness
## 1 Age 712 29.29 29.29 164.98 12.84 79.33 12 0.51
## 2 SibSp 712 0.55 0 1.39 1.18 8 1 3.58
## 3 Parch 712 0.38 0 0.64 0.8 6 0 2.65
## 4 Fare 712 31.84 14.43 2395.7 48.95 512.33 23.17 4.66
## 5 FamilySize 712 1.94 1 2.87 1.7 10 1 2.7
## 6 FarePP 712 19.83 8.15 1389.1 37.27 512.33 11.77 8.07
## 7 Fare_Age 712 1063.99 630.2 2729757.14 1652.2 17931.52 838.89 5.51
## Kurtosis Outliers
## 1 4.1 60
## 2 19.11 43
## 3 12.15 170
## 4 34.32 92
## 5 11.68 76
## 6 93.31 77
## 7 47.18 69
## ####################################################################################
## Quantiles Data Frame of Quantitative Variables
## qAge qFare qSibSp qParch qFamilySize qFarePP qFare_Age
## 0% 0.7 0.0 0 0 1 0.0 0.0
## 5% 6.0 7.2 0 0 1 4.1 109.3
## 25% 22.0 7.9 0 0 1 7.2 225.1
## 50% 29.3 14.4 0 0 1 8.1 630.2
## 75% 34.0 31.1 1 0 2 19.0 1064.0
## 95% 54.0 109.8 3 2 6 69.3 3772.8
## 10% 16.0 7.6 0 0 1 5.1 153.0
## ####################################################################################
##
## Plots of Data Set for Assessment
##
## See Appendix for Alternative Box-Cox Transformation for (Fare).
##
## Plots of TRANSFORMED Data Set
## ####################################################################################
## #########################################################
## Check First Few Rows of SCALED Vars in Data Set
## #########################################################
##
## First few rows of the scaled variables:
## Age_scaled Fare_scaled SibSp_scaled FamilySize_scaled FarePP_scaled
## 717 0.8117058 0.8703281 0.0000000 0.0000000 0.8703281
## 602 0.7465910 0.3502016 0.0000000 0.0000000 0.3502016
## 513 0.7981435 0.5297985 0.0000000 0.0000000 0.5297985
## 195 0.8485720 0.5380013 0.0000000 0.0000000 0.5380013
## 230 0.7465910 0.5249048 0.6123724 0.5335642 0.2895722
## 71 0.7686687 0.3913442 0.0000000 0.0000000 0.3913442
## FareAge_scaled
## 717 0.9255277
## 602 0.7117071
## 513 0.6997559
## 195 0.7256405
## 230 0.7117071
## 71 0.5942273
##
## Plot of Min-Max SCALED Variables:
## ####################################################################################
## Save the Final Transformed and Scaled Data Set
## ####################################################################################
.
Here we call the same functions and run all the same steps above
using ONLY the TEST Set
This clear separation helps to avoid data leakage, overfitting
and minimize bias
.
Show Initial Missing Values.
##
## Working Set Size: 179
## Note the equivalent split size at initial load time in Step 1.
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 37
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 130 0
## ####################################################################################
# Derive engineered features from the raw passenger columns of `df`.
# Every step first announces itself with cat() so the rendered log shows
# which feature is being generated.
cat("\nFeature Generation: ", "Title", "\n")
##
## Feature Generation: Title
# Title: remove everything up to the comma and everything from the first
# period onward, leaving the honorific embedded in Name, stored as a factor.
df[["Title"]] <- factor(gsub("^.*,\\s*|\\..*$", "", df[["Name"]]))
unique(df[["Title"]])
## [1] Mr Mrs Miss Master Dr Col
## Levels: Col Dr Master Miss Mr Mrs
cat("\nFeature Generation: ", "FamilySize", "\n")
##
## Feature Generation: FamilySize
# FamilySize: the passenger plus siblings/spouses plus parents/children.
df[["FamilySize"]] <- with(df, SibSp + Parch + 1)
cat("\nFeature Generation: ", "IsAlone", "\n")
##
## Feature Generation: IsAlone
# IsAlone: "YES" when the passenger travels with no family, otherwise "NO".
df[["IsAlone"]] <- factor(ifelse(df[["FamilySize"]] == 1, "YES", "NO"))
cat("\nFeature Generation: ", "AgeGroup", "\n")
##
## Feature Generation: AgeGroup
# AgeGroup: right-closed bins (-Inf,12], (12,18], (18,60], (60,Inf].
# A missing Age yields a missing AgeGroup.
df[["AgeGroup"]] <- cut(
  df[["Age"]],
  breaks = c(-Inf, 12, 18, 60, Inf),
  labels = c("Child", "Teenager", "Adult", "Senior")
)
cat("\nFeature Generation: ", "FarePP", "\n")
##
## Feature Generation: FarePP
# FarePP: fare per person, i.e. the ticket fare spread over the family size.
df[["FarePP"]] <- with(df, Fare / FamilySize)
cat("\nFeature Generation: ", "Fare_Age", "\n")
##
## Feature Generation: Fare_Age
# Fare_Age: interaction term between fare and age.
# NOTE(review): Age still contains missing values at this point, so Fare_Age
# is NA for those rows; the later imputation appears to fill Fare_Age
# independently of the imputed Age - confirm that this is intended.
df[["Fare_Age"]] <- with(df, Fare * Age)
Perform Pre-processing, Imputation on TEST Set. Show Filtered/Cleaned Data.
## Survived Pclass Name Sex Age SibSp Parch
## 0 0 0 0 0 0 0
## Fare Embarked Title FamilySize IsAlone AgeGroup FarePP
## 0 0 0 0 0 0 0
## Fare_Age
## 0
## ####################################################################################
## Survived Pclass Name Sex Age SibSp
## 6 NO Third Moran, Mr. James male 31.34803 0
## 10 YES Second Nasser, Mrs. Nicholas (Adele Achem) female 14.00000 1
## 11 YES Third Sandstrom, Miss. Marguerite Rut female 4.00000 1
## 12 YES First Bonnell, Miss. Elizabeth female 58.00000 0
## 14 NO Third Andersson, Mr. Anders Johan male 39.00000 1
## 18 YES Second Williams, Mr. Charles Eugene male 31.34803 0
## Parch Fare Embarked Title FamilySize IsAlone AgeGroup FarePP Fare_Age
## 6 0 8.4583 Q Mr 1 YES Adult 8.458300 1265.8672
## 10 0 30.0708 C Mrs 2 NO Teenager 15.035400 420.9912
## 11 1 16.7000 S Miss 3 NO Child 5.566667 66.8000
## 12 0 26.5500 S Miss 1 YES Adult 26.550000 1539.9000
## 14 5 31.2750 S Mr 7 NO Adult 4.467857 1219.7250
## 18 0 13.0000 S Mr 1 YES Adult 13.000000 1265.8672
## ####################################################################################
##
## Take a Peek at Categorical Features
## ####################################################################################
## #################################################################
## Check Stats & Distributions of Data Set
## #################################################################
##
## Assessment Statistics for Data Set
## Var Obs Mean Median Variance St.Dev Range IQR Skewness
## 1 Age 179 31.35 31.35 183.58 13.55 73.58 15 0.15
## 2 SibSp 179 0.4 0 0.5 0.71 4 1 2.62
## 3 Parch 179 0.37 0 0.68 0.83 5 0 3.09
## 4 Fare 179 33.66 14.5 2775.19 52.68 512.33 22.77 5.13
## 5 FamilySize 179 1.77 1 1.51 1.23 6 1 2.24
## 6 FarePP 179 20.26 8.46 874.24 29.57 256.16 18.92 4.5
## 7 Fare_Age 179 1265.87 708.4 4291568.49 2071.61 18443.85 1017.8 4.79
## Kurtosis Outliers
## 1 3.59 6
## 2 12.44 3
## 3 14.69 43
## 4 41.24 24
## 5 8.62 15
## 6 29.47 10
## 7 33.45 18
## ####################################################################################
## Quantiles Data Frame of Quantitative Variables
## qAge qFare qSibSp qParch qFamilySize qFarePP qFare_Age
## 0% 0.4 0.0 0 0 1 0.0 0.0
## 5% 4.9 7.0 0 0 1 4.5 92.5
## 25% 24.0 7.9 0 0 1 7.5 248.1
## 50% 31.3 14.5 0 0 1 8.5 708.4
## 75% 39.0 30.7 1 0 2 26.4 1265.9
## 95% 55.1 121.4 1 2 4 55.1 4320.4
## 10% 16.8 7.5 0 0 1 5.5 142.5
## ####################################################################################
##
## Plots of Data Set for Assessment
##
## See Appendix for Alternative Box-Cox Transformation for (Fare).
##
## Plots of TRANSFORMED Data Set
## ####################################################################################
## #########################################################
## Check First Few Rows of SCALED Vars in Data Set
## #########################################################
##
## First few rows of the scaled variables:
## Age_scaled Fare_scaled SibSp_scaled FamilySize_scaled FarePP_scaled
## 6 0.7880084 0.3600260 0.0 0.0000000 0.4048662
## 10 0.5942762 0.5506031 0.5 0.2516866 0.4999893
## 11 0.3173266 0.4604395 0.5 0.4448125 0.3391176
## 12 0.9395107 0.5313326 0.0 0.0000000 0.5975085
## 14 0.8415338 0.5566959 0.5 1.0000000 0.3061214
## 18 0.7880084 0.4228637 0.0 0.0000000 0.4755301
## FareAge_scaled
## 6 0.7273375
## 10 0.6154196
## 11 0.4292741
## 12 0.7472733
## 14 0.7235603
## 18 0.7273375
##
## Plot of Min-Max SCALED Variables:
## ####################################################################################
## Save the Final Transformed and Scaled Data Set
## ####################################################################################
We move into full-feature engineering techniques and applications for this EDA of the Titanic.csv data set. Just before we go to the 14th and final step in our EDA algorithm, we carefully split the data, avoid leakage, and then derive and apply new column variables to aid in initiating model building.
Data Preparation
Splitting the dataset into training and testing samples BEFORE any data prep, missing data handling or imputation is essential to prevent overfitting and ensure the model performs well on unseen data. By training on one part (the training set) and testing on another (the testing set), we can obtain reliable performance metrics like accuracy and precision. This approach also prevents data leakage, where the testing data influences training, leading to overly optimistic results.
Splitting the Data
To ensure reliable evaluation, the Titanic dataset was split into 80% training and 20% testing samples using random splitting with a set seed. Splitting the dataset into training and testing samples ensures the model generalizes well to new, unseen data rather than just memorizing the training data. Testing on a separate dataset allows us to calculate reliable metrics such as accuracy, precision, recall, or root mean square error, depending on the problem type. Splitting prevents data leakage, where information from the testing set influences the training phase, leading to overly optimistic model performance. Random splitting with a set seed ensures consistent results across multiple runs, allowing comparisons and debugging.
Generating New Features. Assessing and Applying Transformations “Separately”
Several feature engineering techniques can be considered and applied in an EDA, including those explored and applied in the studies, such as (1) handling missing values with mean and mode imputation, (2) transformations like log, square root, or box-cox, (3) outlier handling, such as removing extreme values, and (4) scaling and normalization.
Some feature engineering techniques we can consider and explore in this study include feature aggregation, binning-and-discretization, and derived features like title extraction. Other techniques that will not apply to this study but may be applicable to different data sets at different times include polynomial features, time-based features, textual features, dimensional reduction, and feature encoding, such as possibly using the one-hot technique of encoding to create binary indicators for the categories.
Feature engineering brings enhanced predictive power, improved interpretability, handling of nonlinear relationships, reduced noise, and domain-specific insights. New variables like Title, FamilySize, and IsAlone add meaningful information that could correlate strongly with survival. Grouping variables (e.g., AgeGroup) makes insights more understandable for stakeholders. Features like FarePP (fare per person) normalize relationships and capture nuanced information (e.g., socio-economic status per individual). Binary variables like IsAlone simplify analysis and reduce noise in the data. Titles, family dynamics, and per-person fare align with real-world survival factors on the Titanic.
Perform and Apply Min-Max Scaling
Normalization is still applied at the end of transforming each set separately: the training set first, then the test set. Min-max normalization transforms all variable values into a range of 0 to 1. This is particularly useful for several reasons, including feature comparability, improved model convergence, and maintenance of the relative relationships between values. Note, however, that min-max scaling remains sensitive to outliers, because the minimum and maximum define the scale.
Importance of Feature Engineering for Training and Test Data
Feature engineering is a vital step when it comes to processing data. It enhances a machine-learning model’s predictive power and quality. When a dataset is split into test and training sets, applying the same transformations to the test dataset as first applied to the training dataset is critical. This ensures consistency, fairness, and the validity of model evaluation. This consistent application is crucial to ensure consistent data representations, prevent data leakage, enable fair model evaluation, avoid computational errors, and preserve model interpretability, which is necessary for deployment consistency.
Ensure Consistent Data Representation: The model trained on the training dataset expects the same structure, range, and scale of features during testing and deployment. Differences in feature engineering between training and test sets lead to mismatched data representations, causing the model to perform poorly on unseen data. For example, if Fare is log-transformed in the training dataset but not in the test dataset, the test data will contain larger numerical values, leading to errors or poor predictions since the model has not been trained to interpret untransformed values.
Prevent Data Leakage: If the test set is treated differently, it can introduce data leakage, leading to overly optimistic results. To prevent this, transformations should be applied using only the training dataset. In a prior study by this author of the Titanic dataset, the mean age for imputation was calculated with the test data included, causing biased model evaluation. In that study, missing data was concentrated in the Cabin variable (79% of all missing values) and minimal overall, which led to the decision to remove that variable (Hinton, 2024). However, the best practices in this study suggest handling missing data after splitting the sets and applying imputations consistently and distinctly to training and test datasets.
Enable Fair Model Evaluation: The test dataset represents unseen data that simulate real-world scenarios. It should undergo the same preprocessing steps to evaluate how well the model generalizes. Any deviation between training and test preprocessing creates an unfair testing environment, leading to unreliable performance metrics. Suppose Age is categorized into bins (e.g., Child, Teenager, Adult, Senior) based on thresholds derived only from the training data. Applying different thresholds to the test data would result in mismatched categories, skewing the evaluation.
Avoid Computational Errors: Many machine learning models require consistent input dimensions, formats, and scales. If feature engineering is not consistently applied, the model may fail to make predictions or produce invalid outputs. For example, if FamilySize is created as a derived feature in the training set but not in the test set, the test set will lack this variable, causing an error during model evaluation or inference.
Preserve Model Interpretability: Feature engineering transforms raw data into a format aligning with the underlying assumptions of the model. Applying inconsistent transformations disrupts this alignment, making model outputs harder to interpret. If min-max scaling were only applied to Fare in the training data but not in the test data, predictions involving Fare would be based on incomparable scales, reducing the model’s interpretability.
Deployment Consistency: The transformations applied during training must also be applied during model deployment to ensure that new incoming data is processed like the training data. If a log transformation is applied to Fare during training but not during deployment, predictions on live data will be inaccurate because the incoming data will not match the trained model’s expectations.
Summary
By applying techniques like feature aggregation, binning, and derived features, the Titanic dataset was transformed into a model-ready format. Consistently applying these techniques across training and test datasets ensured fairness, prevented data leakage, and enhanced predictive modeling accuracy.
References
Shah, T. (2017, December 6). About train, validation and test sets in machine learning. Medium; Towards Data Science. https://towardsdatascience.com/train-validation-and-test-sets-72cb40cba9e7 .
Dunning, T. (2020). Practical feature engineering (O’Reilly Media). https://learning.oreilly.com/videos/practical-feature-engineering/0636920371823 .
Wang, K., Wang, P., & Xu, C. (2022). Toward efficient automated feature engineering. arXiv. https://arxiv.org/abs/2212.13152
Venables, W. N. and Ripley, B. D. (2002) Modern Applied Statistics with S. Fourth edition. Springer. https://link.springer.com/book/10.1007/978-0-387-21706-2
Hinton, W. (2024). Split, Transform, and Scale the Data Set. Available at Rpubs. https://www.rpubs.com/whinton/.
Packt Publishing. (2018). R programming for statistics and data science (Media from Packt Publishing available freely through O’Reilly Media Inc.). https://learning.oreilly.com/course/r-programming-for/9781789950298/
Datar, R., & Garg, H. (2019). Hands-on exploratory data analysis with R: Become an expert in exploratory data analysis using R packages. O’Reilly Media, Inc.
Prabhakaran, S. (2023). The complete ggplot2 tutorial. R-statistics.co. Available online: https://r-statistics.co/Complete-Ggplot2-Tutorial-Part1-With-R-Code.html
Smeaton, A. (2003). NIST/SEMATECH Engineering Statistics Handbook. https://www.itl.nist.gov/div898/handbook/. R Programming for Statistics and Data Science (Media from Packt Publishing available freely through O’Reilly Media Inc.). (2018).
.
This study was conducted and performed by Will Hinton.
# Box-Cox transformation of Fare (requires strictly positive values).
# The Box-Cox transformation is a power transformation used to bring
# non-normal data closer to a normal distribution. It is commonly used in
# statistical modeling to improve normality and to stabilize the variance.
#
# Steps:
#   1. Shift Fare by 1 so that zero fares become strictly positive.
#   2. Profile the log-likelihood over lambda in [-2, 2] with boxcox() and
#      take the lambda at its maximum.
#   3. Apply the transformation and plot the density histogram.
#
# NOTE(review): lambda is estimated here on the whole `titanic` data frame;
# if Fare_boxcox is ever fed to a model, estimate lambda on the training
# split only - confirm how this appendix output is used.
cat("\nAlternative Box-Cox Transformation for Fare (See ?boxcox() documentation):","\n")
##
## Alternative Box-Cox Transformation for Fare (See ?boxcox() documentation):
fare_no_zeros <- titanic$Fare + 1 # To handle 0 values
boxcox_fare <- boxcox(lm(fare_no_zeros ~ 1), lambda = seq(-2, 2, by = 0.1))
optimal_lambda <- boxcox_fare$x[which.max(boxcox_fare$y)]
# The power form (x^lambda - 1) / lambda is undefined at lambda = 0 and
# numerically unstable very close to it; its limit there is log(x), so use
# the log form whenever the selected lambda is effectively zero.
if (abs(optimal_lambda) < sqrt(.Machine$double.eps)) {
  titanic$Fare_boxcox <- log(fare_no_zeros)
} else {
  titanic$Fare_boxcox <- (fare_no_zeros^optimal_lambda - 1) / optimal_lambda
}
# After transformation
pFareboxcox <- ggplot(titanic, aes(x = Fare_boxcox)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30, fill = "green", color = "black") +
  ggtitle("BoxCox Transformed Fare Distribution") +
  xlab("Transformed Fare") +
  ylab("Density") +
  theme_minimal()
plot(pFareboxcox)