Set the path and import the .csv file containing the data set (titanic.csv). Account for a header row, specify the separator/delimiter as comma, and treat strings as factor (qualitative) variables at the outset.
# Point the session at the project folder that holds titanic.csv.
setwd("/Users/whinton/src/rstudio/tim8501")

# Read the raw passenger file: first row is the header, fields are
# comma-separated, and text columns are loaded as factors.
titanic <- read.csv(
  file = "titanic.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)

# All cleaning below is done on `df`, leaving `titanic` as read from disk.
df <- titanic
Check the number of observations (rows) using nrow(), and the number of columns using length() or ncol().
# Report the data frame's dimensions (for a data frame, ncol() equals length()).
cat("Number of rows:", nrow(df), "Number of cols:", ncol(df), "\n")
## Number of rows: 891 Number of cols: 12
# Per-column summary of the raw data: quantiles/mean for numeric columns,
# level counts for factor columns, and NA counts where present.
summary(df)
## PassengerId Survived Pclass
## Min. : 1.0 Min. :0.0000 Min. :1.000
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000
## Median :446.0 Median :0.0000 Median :3.000
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Name Sex Age
## Abbing, Mr. Anthony : 1 female:314 Min. : 0.42
## Abbott, Mr. Rossmore Edward : 1 male :577 1st Qu.:20.12
## Abbott, Mrs. Stanton (Rosa Hunt) : 1 Median :28.00
## Abelson, Mr. Samuel : 1 Mean :29.70
## Abelson, Mrs. Samuel (Hannah Wizosky): 1 3rd Qu.:38.00
## Adahl, Mr. Mauritz Nils Martin : 1 Max. :80.00
## (Other) :885 NA's :177
## SibSp Parch Ticket Fare
## Min. :0.000 Min. :0.0000 1601 : 7 Min. : 0.00
## 1st Qu.:0.000 1st Qu.:0.0000 347082 : 7 1st Qu.: 7.91
## Median :0.000 Median :0.0000 CA. 2343: 7 Median : 14.45
## Mean :0.523 Mean :0.3816 3101295 : 6 Mean : 32.20
## 3rd Qu.:1.000 3rd Qu.:0.0000 347088 : 6 3rd Qu.: 31.00
## Max. :8.000 Max. :6.0000 CA 2144 : 6 Max. :512.33
## (Other) :852
## Cabin Embarked
## :687 : 2
## B96 B98 : 4 C:168
## C23 C25 C27: 4 Q: 77
## G6 : 4 S:644
## C22 C26 : 3
## D : 3
## (Other) :186
# Show each column's storage type and its first few values.
str(df)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
# Descriptive statistics (n, mean, sd, median, skew, kurtosis, ...) per column.
# Rows flagged with * in the output below are the factor columns.
psych::describe(df)
## vars n mean sd median trimmed mad min max range
## PassengerId 1 891 446.00 257.35 446.00 446.00 330.62 1.00 891.00 890.00
## Survived 2 891 0.38 0.49 0.00 0.35 0.00 0.00 1.00 1.00
## Pclass 3 891 2.31 0.84 3.00 2.39 0.00 1.00 3.00 2.00
## Name* 4 891 446.00 257.35 446.00 446.00 330.62 1.00 891.00 890.00
## Sex* 5 891 1.65 0.48 2.00 1.68 0.00 1.00 2.00 1.00
## Age 6 714 29.70 14.53 28.00 29.27 13.34 0.42 80.00 79.58
## SibSp 7 891 0.52 1.10 0.00 0.27 0.00 0.00 8.00 8.00
## Parch 8 891 0.38 0.81 0.00 0.18 0.00 0.00 6.00 6.00
## Ticket* 9 891 339.52 200.83 338.00 339.65 268.35 1.00 681.00 680.00
## Fare 10 891 32.20 49.69 14.45 21.38 10.24 0.00 512.33 512.33
## Cabin* 11 891 18.63 38.14 1.00 8.29 0.00 1.00 148.00 147.00
## Embarked* 12 891 3.53 0.80 4.00 3.66 0.00 1.00 4.00 3.00
## skew kurtosis se
## PassengerId 0.00 -1.20 8.62
## Survived 0.48 -1.77 0.02
## Pclass -0.63 -1.28 0.03
## Name* 0.00 -1.20 8.62
## Sex* -0.62 -1.62 0.02
## Age 0.39 0.16 0.54
## SibSp 3.68 17.73 0.04
## Parch 2.74 9.69 0.03
## Ticket* 0.00 -1.28 6.73
## Fare 4.77 33.12 1.66
## Cabin* 2.09 3.07 1.28
## Embarked* -1.27 -0.16 0.03
Examine where the missing values are, and isolate them.
# Normalise missing-value markers: every empty string becomes a real NA
# (not the string "NA"); values that are already NA are left as they are.
#
# Each column is processed as a whole vector instead of cell by cell, which
# avoids one data-frame index operation per cell and also behaves correctly
# for a data frame with zero rows or zero columns (where 1:nrow(df) would
# iterate over c(1, 0)).
df[] <- lapply(df, function(column) {
  # %in% never yields NA, so existing NAs cannot poison the logical index.
  column[column %in% ""] <- NA
  column
})
# Count the NAs in every column once, then derive how many columns have any.
na_per_column <- colSums(is.na(df))
cols_with_nas <- sum(na_per_column > 0)
na_per_column
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 687 2
cat("All NAs are in", cols_with_nas, "columns ")
## All NAs are in 3 columns
cat("")
# Fraction of missing values per column, held as a one-row data frame.
mdf <- as.data.frame(lapply(df, function(column) mean(is.na(column))))

# Only these three columns contain NAs; pull out their missing fractions.
labs <- c("Age", "Cabin", "Embarked")
vals <- vapply(labs, function(nm) mdf[[nm]][1], numeric(1), USE.NAMES = FALSE)

# Slice labels show each column's share of the combined missing fraction.
piepercent <- round(100 * vals / sum(vals), 1)
slice_colours <- rainbow(3)
pie(
  vals,
  labels = piepercent,
  main = "Missing Values by Column Variable",
  col = slice_colours
)
legend("topright", labs, cex = 0.8, fill = slice_colours)

# Missingness map of the whole data frame.
missmap(df)
# Drop the columns that are not used in the univariate analysis
# (PassengerId, Name, Ticket and Cabin) in a single select() call.
df <- select(df, -c(PassengerId, Name, Ticket, Cabin))
# Impute the remaining missing values, one column at a time:
#   * numeric   -> the column mean when more than 10 values are observed,
#                  otherwise linear interpolation over the row index;
#   * factor    -> the most frequent level (the mode);
#   * character -> the literal string "NA".
for (col in names(df)) {
  column <- df[[col]]
  is_missing <- is.na(column)

  if (is.numeric(column)) { # is.numeric() is already TRUE for integers
    n_observed <- sum(!is_missing)
    if (n_observed > 10) {
      # Enough data: fill with the mean of the observed values.
      column[is_missing] <- mean(column, na.rm = TRUE)
    } else if (n_observed >= 2) {
      # Few observations: interpolate at the missing row positions.
      # `xout` pins the output to those exact positions (approx(n = ...)
      # would instead return equally spaced points over the observed range),
      # and rule = 2 carries the nearest observed value out to the ends.
      row_index <- seq_along(column)
      column[is_missing] <- approx(
        row_index, column,
        xout = row_index[is_missing], rule = 2
      )[["y"]]
    } else if (n_observed == 1) {
      # approx() needs two points; with one, repeat the single observed value.
      column[is_missing] <- column[!is_missing]
    }
    # With no observed values there is nothing to impute from; leave the NAs.
  } else if (is.factor(column)) {
    # table() ignores NAs; which.max() picks the first level on ties.
    mode_val <- names(which.max(table(column)))
    column[is_missing] <- mode_val
  } else if (is.character(column)) {
    column[is_missing] <- "NA"
  }

  df[[col]] <- column
}
# Survived and Pclass are stored as numbers but take a small, fixed set of
# values that name a characteristic rather than measure a quantity, so they
# are recoded as labelled factors.
df$Survived <- cut(df$Survived, breaks = c(-1, 0, 1), labels = c("NO", "YES"))
df$Pclass <- cut(
  df$Pclass,
  breaks = 0:3,
  labels = c("First", "Second", "Third")
)

# Rebuild every categorical column from its character values so that levels
# no longer present in the data are dropped. Name, Ticket and Cabin are not
# listed because those columns were removed earlier.
categorical_cols <- c("Survived", "Pclass", "Sex", "Embarked")
df[categorical_cols] <- lapply(
  df[categorical_cols],
  function(values) as.factor(as.character(values))
)
# Confirm the cleaned data set: 8 retained variables and no missing values,
# ready for univariate visualization and analysis.
###########################################################
missmap(df)

# NA count per column, computed column by column.
na_counts_base <- vapply(df, function(column) sum(is.na(column)), numeric(1))
print(na_counts_base)
## Survived Pclass Sex Age SibSp Parch Fare Embarked
## 0 0 0 0 0 0 0 0
# Per-column summary of the cleaned data (level counts for the factors,
# quantiles and mean for the numeric columns).
summary(df)
## Survived Pclass Sex Age SibSp
## NO :549 First :216 female:314 Min. : 0.42 Min. :0.000
## YES:342 Second:184 male :577 1st Qu.:22.00 1st Qu.:0.000
## Third :491 Median :29.70 Median :0.000
## Mean :29.70 Mean :0.523
## 3rd Qu.:35.00 3rd Qu.:1.000
## Max. :80.00 Max. :8.000
## Parch Fare Embarked
## Min. :0.0000 Min. : 0.00 C:168
## 1st Qu.:0.0000 1st Qu.: 7.91 Q: 77
## Median :0.0000 Median : 14.45 S:646
## Mean :0.3816 Mean : 32.20
## 3rd Qu.:0.0000 3rd Qu.: 31.00
## Max. :6.0000 Max. :512.33
# Descriptive statistics for the cleaned data; rows flagged with * in the
# output below are the factor columns.
psych::describe(df)
## vars n mean sd median trimmed mad min max range skew
## Survived* 1 891 1.38 0.49 1.00 1.35 0.00 1.00 2.00 1.00 0.48
## Pclass* 2 891 2.31 0.84 3.00 2.39 0.00 1.00 3.00 2.00 -0.63
## Sex* 3 891 1.65 0.48 2.00 1.68 0.00 1.00 2.00 1.00 -0.62
## Age 4 891 29.70 13.00 29.70 29.25 9.34 0.42 80.00 79.58 0.43
## SibSp 5 891 0.52 1.10 0.00 0.27 0.00 0.00 8.00 8.00 3.68
## Parch 6 891 0.38 0.81 0.00 0.18 0.00 0.00 6.00 6.00 2.74
## Fare 7 891 32.20 49.69 14.45 21.38 10.24 0.00 512.33 512.33 4.77
## Embarked* 8 891 2.54 0.79 3.00 2.67 0.00 1.00 3.00 2.00 -1.26
## kurtosis se
## Survived* -1.77 0.02
## Pclass* -1.28 0.03
## Sex* -1.62 0.02
## Age 0.95 0.44
## SibSp 17.73 0.04
## Parch 9.69 0.03
## Fare 33.12 1.66
## Embarked* -0.22 0.03
# Spread of the two continuous variables: interquartile range, plus the
# number of points that boxplot.stats() reports in $out (values lying
# beyond the boxplot whiskers).
ageIQR <- paste("IQR for Age:", round(IQR(df$Age), 2))
fareIQR <- paste("IQR for Fare:", round(IQR(df$Fare), 2))

# plyr::count() tabulates a plain vector into `x` / `freq` columns. The
# namespace is spelled out because dplyr::count(), which masks it when dplyr
# is attached later, does not accept a bare vector.
ageBPstats <- c(plyr::count(boxplot.stats(df$Age)$out))
fareBPstats <- c(plyr::count(boxplot.stats(df$Fare)$out))

# Summing the per-value frequencies gives the total number of outliers.
ageOutliers <- paste("Number of Outliers for Age:", sum(ageBPstats$freq))
fareOutliers <- paste("Number of Outliers for Fare:", sum(fareBPstats$freq))
ageIQR
## [1] "IQR for Age: 13"
ageOutliers
## [1] "Number of Outliers for Age: 66"
fareIQR
## [1] "IQR for Fare: 23.09"
fareOutliers
## [1] "Number of Outliers for Fare: 116"
Generating plots of a single variable.
Multi-Plot of a single categorical/factor variable Survived
(YES=1,NO=0) .
# Univariate multi-plot for one column of `df`.
# Set `col` to the column name. Four panels are drawn on a two-column grid:
#   p1  bar graph (factor column) or histogram (numeric column)
#   p2  normal probability (Q-Q) plot
#   p3  dot plot
#   p4  box plot
################################################################################
col <- "Survived"

if (is.factor(df[[col]])) {
  # Categorical column: one bar per level, filled by level.
  p1 <- ggplot(data = df, mapping = aes(x = .data[[col]], fill = .data[[col]]))
  p1 <- p1 + geom_bar()
  p1 <- p1 + labs(title = paste("Bar Graph for", col), x = col, y = "Count")
  p1 <- p1 + theme_minimal() + theme(legend.position = "right")
} else if (is.numeric(df[[col]])) {
  # Numeric column (is.numeric() also covers integers): histogram.
  # The bin width has to suit the variable's range; adjust it per column.
  b_width <- 10
  p1 <- ggplot(data = df, mapping = aes(x = .data[[col]]))
  p1 <- p1 + geom_histogram(
    binwidth = b_width, fill = "transparent", color = "blue"
  )
  p1 <- p1 + labs(title = paste("Histogram for", col), x = col, y = "Count")
  p1 <- p1 + theme_minimal()
}

# p2: Q-Q plot against a normal distribution whose mean and sd are taken
# from the column's numeric values (level codes for a factor).
s_parm <- as.numeric(df[[col]])
parms <- data.frame(mean = mean(s_parm), sd = sd(s_parm))
p2 <- ggplot(df, aes(sample = .data[[col]])) +
  geom_qq(dparams = parms) +
  geom_abline() +
  labs(title = paste("Probability for", col))

# p3: dot plot; y is a one-level factor holding the row count, so every
# observation shares the same y position.
y_axis <- factor(nrow(df))
p3 <- ggplot(df, aes(x = .data[[col]], y = y_axis)) +
  geom_dotplot(binwidth = 0.05) +
  labs(title = paste("Dot Plot for", col), x = col, y = "Count") +
  theme(plot.title = element_text(hjust = 0.5))

# p4: box plot on a single dummy x group. Use it to read off the IQR and
# to check for outliers.
p4 <- ggplot(df, aes(x = factor(1), y = .data[[col]])) +
  geom_boxplot() +
  labs(title = paste("Box Plot for", col), x = col, y = "Value") +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

multiplot(p1, p2, p3, p4, cols = 2)
Multi-Plot of a single categorical/factor variable Pclass (First=1,Second=2,Third=3) .
# Univariate multi-plot for one column of `df`.
# Set `col` to the column name. Four panels are drawn on a two-column grid:
#   p1  bar graph (factor column) or histogram (numeric column)
#   p2  normal probability (Q-Q) plot
#   p3  dot plot
#   p4  box plot
################################################################################
col <- "Pclass"

if (is.factor(df[[col]])) {
  # Categorical column: one bar per level, filled by level.
  p1 <- ggplot(df, aes(x = .data[[col]], fill = .data[[col]])) +
    geom_bar() +
    labs(title = paste("Bar Graph for", col), x = col, y = "Count") +
    theme_minimal() +
    theme(legend.position = "right")
} else if (is.numeric(df[[col]])) {
  # Numeric fallback, matching the other single-variable sections. Without
  # it `p1` would silently keep the plot built for the previous column.
  # A bin width of 1 gives one bin per class code when Pclass is numeric.
  b_width <- 1
  p1 <- ggplot(df, aes(x = .data[[col]])) +
    geom_histogram(binwidth = b_width, fill = "transparent", color = "blue") +
    labs(title = paste("Histogram for", col), x = col, y = "Count") +
    theme_minimal()
}

# p2: Q-Q plot against a normal distribution whose mean and sd are taken
# from the column's numeric values (level codes for a factor).
s_parm <- c(as.numeric(df[[col]]))
parms <- df %>% summarize(mean = mean(s_parm), sd = sd(s_parm))
p2 <- df %>%
  ggplot(aes(sample = .data[[col]])) +
  geom_qq(dparams = parms) +
  geom_abline() +
  ggtitle(paste("Probability for", col))

# p3: dot plot; y is a one-level factor holding the row count, so every
# observation shares the same y position.
y_axis <- as.factor(nrow(df))
p3 <- ggplot(df, aes(x = .data[[col]], y = y_axis)) +
  geom_dotplot(binwidth = .05) +
  labs(title = paste("Dot Plot for", col), x = col, y = "Count") +
  theme(plot.title = element_text(hjust = 0.5))

# p4: box plot on a single dummy x group. Use it to read off the IQR and
# to check for outliers.
p4 <- ggplot(df, aes(x = factor(1), y = .data[[col]])) +
  geom_boxplot() +
  labs(title = paste("Box Plot for", col), x = col, y = "Value") +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

multiplot(p1, p2, p3, p4, cols = 2)
Generating plots of a single quantitative variable.
Multi-Plot of a single quantitative/numeric variable
Age(decimal) .
# Univariate multi-plot for one column of `df`.
# Set `col` to the column name. Four panels are drawn on a two-column grid:
#   p1  bar graph (factor column) or histogram (numeric column)
#   p2  normal probability (Q-Q) plot
#   p3  dot plot
#   p4  box plot
################################################################################
col <- "Age"

if (is.factor(df[[col]])) {
  # Categorical column: one bar per level, filled by level.
  p1 <- ggplot(data = df, mapping = aes(x = .data[[col]], fill = .data[[col]]))
  p1 <- p1 + geom_bar()
  p1 <- p1 + labs(title = paste("Bar Graph for", col), x = col, y = "Count")
  p1 <- p1 + theme_minimal() + theme(legend.position = "right")
} else if (is.numeric(df[[col]])) {
  # Numeric column (is.numeric() also covers integers): histogram.
  # The bin width has to suit the variable's range; about 10 works for age.
  b_width <- 10
  p1 <- ggplot(data = df, mapping = aes(x = .data[[col]]))
  p1 <- p1 + geom_histogram(
    binwidth = b_width, fill = "transparent", color = "blue"
  )
  p1 <- p1 + labs(title = paste("Histogram for", col), x = col, y = "Count")
  p1 <- p1 + theme_minimal()
}

# p2: Q-Q plot against a normal distribution whose mean and sd are taken
# from the column's values.
s_parm <- as.numeric(df[[col]])
parms <- data.frame(mean = mean(s_parm), sd = sd(s_parm))
p2 <- ggplot(df, aes(sample = .data[[col]])) +
  geom_qq(dparams = parms) +
  geom_abline() +
  labs(title = paste("Probability for", col))

# p3: dot plot; y is a one-level factor holding the row count, so every
# observation shares the same y position.
y_axis <- factor(nrow(df))
p3 <- ggplot(df, aes(x = .data[[col]], y = y_axis)) +
  geom_dotplot(binwidth = 0.5) +
  labs(title = paste("Dot Plot for", col), x = col, y = "Count") +
  theme(plot.title = element_text(hjust = 0.5))

# p4: box plot on a single dummy x group. Use it to read off the IQR and
# to check for outliers.
p4 <- ggplot(df, aes(x = factor(1), y = .data[[col]])) +
  geom_boxplot() +
  labs(title = paste("Box Plot for", col), x = col, y = "Value") +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

multiplot(p1, p2, p3, p4, cols = 2)
Generating plots of a single quantitative variable.
Multi-Plot of a single quantitative/numeric variable
Fare(decimal).
# Univariate multi-plot for one column of `df`.
# Set `col` to the column name. Four panels are drawn on a two-column grid:
#   p1  bar graph (factor column) or histogram (numeric column)
#   p2  normal probability (Q-Q) plot
#   p3  dot plot
#   p4  box plot
################################################################################
col <- "Fare"

if (is.factor(df[[col]])) {
  # Categorical column: one bar per level, filled by level.
  p1 <- ggplot(data = df, mapping = aes(x = .data[[col]], fill = .data[[col]]))
  p1 <- p1 + geom_bar()
  p1 <- p1 + labs(title = paste("Bar Graph for", col), x = col, y = "Count")
  p1 <- p1 + theme_minimal() + theme(legend.position = "right")
} else if (is.numeric(df[[col]])) {
  # Numeric column (is.numeric() also covers integers): histogram.
  # The bin width has to suit the variable's range; about 30 works for fare.
  b_width <- 30
  p1 <- ggplot(data = df, mapping = aes(x = .data[[col]]))
  p1 <- p1 + geom_histogram(
    binwidth = b_width, fill = "transparent", color = "blue"
  )
  p1 <- p1 + labs(title = paste("Histogram for", col), x = col, y = "Count")
  p1 <- p1 + theme_minimal()
}

# p2: Q-Q plot against a normal distribution whose mean and sd are taken
# from the column's values.
s_parm <- as.numeric(df[[col]])
parms <- data.frame(mean = mean(s_parm), sd = sd(s_parm))
p2 <- ggplot(df, aes(sample = .data[[col]])) +
  geom_qq(dparams = parms) +
  geom_abline() +
  labs(title = paste("Probability for", col))

# p3: dot plot; y is a one-level factor holding the row count, so every
# observation shares the same y position.
y_axis <- factor(nrow(df))
p3 <- ggplot(df, aes(x = .data[[col]], y = y_axis)) +
  geom_dotplot(binwidth = 3) +
  labs(title = paste("Dot Plot for", col), x = col, y = "Count") +
  theme(plot.title = element_text(hjust = 0.5))

# p4: box plot on a single dummy x group. Use it to read off the IQR and
# to check for outliers.
p4 <- ggplot(df, aes(x = factor(1), y = .data[[col]])) +
  geom_boxplot() +
  labs(title = paste("Box Plot for", col), x = col, y = "Value") +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())

multiplot(p1, p2, p3, p4, cols = 2)
In the Titanic dataset, visualizing individual variables helps reveal key patterns, such as passenger demographics and survival rates, and highlights data issues that may affect further analysis. Here’s why univariate visualization is significant and how it aids in understanding each variable’s properties:
Understanding Distribution
Shape of Distribution: Univariate visualizations such as histograms and box plots help understand the overall shape of a variable’s distribution (e.g., normal, skewed, or multimodal). For example, in the Titanic dataset, a histogram of the Fare variable shows a right-skewed distribution, indicating that most passengers paid lower fares, with a few high fares contributing to a long tail. Similarly, an Age histogram may reveal multiple peaks, indicating a multimodal distribution with clusters of children, young adults, and older passengers.
Symmetry and Normality: Visualizations like probability plots (Q-Q plots) assess whether a variable follows a normal distribution, which is crucial for certain statistical tests and modeling techniques.
Identifying Data Issues
Outliers: Box plots highlight potential outliers, which are values that deviate significantly from the rest of the data.
In the Titanic dataset, box plots for Fare often display extreme outliers due to a few high-ticket prices in first class, which can influence measures like the mean and skew the analysis. Identifying outliers is essential for deciding whether they should be retained, transformed, or removed based on their impact on analysis and modeling.
Missing Values: Visualizations can help spot gaps in data, particularly when plotting categorical variables (e.g., Embarked). Missing values may indicate data quality issues that need addressing, such as imputing missing values or removing rows.
Examining Central Tendency and Spread
Measures of Central Tendency: Histograms, bar plots, and box plots provide insights into measures like the mean, median, and mode. For example, the box plot for Age reveals the median age, which can indicate the central age group on the Titanic.
Variance and Spread: Box plots and histograms reveal the spread of the data, showing the range, interquartile range (IQR), and any concentration of values. In the case of Fare, visualizing the IQR can help understand the fare range most passengers paid and how far the highest fares deviate from the median.
Categorical Variable Analysis
Frequency Distribution: Bar plots are particularly useful for categorical variables such as Sex and Pclass. They provide an easy way to observe the frequency of each category, showing how many passengers belonged to each class or gender. For instance, the distribution of Survived values (0 for no, 1 for yes) immediately shows the survival rate, a critical insight in understanding the outcomes in the Titanic dataset.
Class Proportions: By examining the proportions within each category (e.g., the ratio of male to female passengers), bar plots can provide context for any survival analysis, particularly when survival rates are broken down by class or gender.
Basis for Feature Engineering and Further Analysis
Transformations: Understanding the distribution helps inform whether transformations (e.g., log or square root transformations) may be beneficial, particularly for variables with skewed distributions like Fare.
Imputation Decisions: Visualizing missing or skewed data (e.g., Age) helps decide on imputation methods (e.g., mean or median imputation) and sets a clear basis for handling data gaps in a way that best preserves the variable’s characteristics.
This study was conducted by Will Hinton.