Link to the project on RPubs: https://rpubs.com/ofomicheva86/390683
#required packages
library(MASS)
library(PerformanceAnalytics)
1. DATA EXPLORATION
The dataset contains the variables described below:
Read training and testing datasets
# Load the Titanic training and testing sets from GitHub.
# Both empty strings and the literal "NA" are read as missing values.
# `header = TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned, whereas `TRUE` is a reserved constant.

# Training set
data <- read.csv(
  file = "https://raw.githubusercontent.com/olga0503/DATA-621/master/titanic_train.csv",
  header = TRUE,
  na.strings = c("", "NA")
)

# Testing set
data_testing <- read.csv(
  file = "https://raw.githubusercontent.com/olga0503/DATA-621/master/titanic_test.csv",
  header = TRUE,
  na.strings = c("", "NA")
)

# Display the first six rows of the training set
head(data)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp
## 1 Braund, Mr. Owen Harris male 22 1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
## 3 Heikkinen, Miss. Laina female 26 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
## 5 Allen, Mr. William Henry male 35 0
## 6 Moran, Mr. James male NA 0
## Parch Ticket Fare Cabin Embarked
## 1 0 A/5 21171 7.2500 <NA> S
## 2 0 PC 17599 71.2833 C85 C
## 3 0 STON/O2. 3101282 7.9250 <NA> S
## 4 0 113803 53.1000 C123 S
## 5 0 373450 8.0500 <NA> S
## 6 0 330877 8.4583 <NA> Q
# Report the dimensions (rows, columns) of the training set
dim(data)
## [1] 891 12
# List the distinct non-missing values of Parch, as factor levels
levels(factor(data[["Parch"]]))
## [1] "0" "1" "2" "3" "4" "5" "6"
#required packages
library(MASS)
library(PerformanceAnalytics)
# Shift Fare by a small constant before transforming it.
# NOTE(review): presumably this keeps every value strictly positive for the
# Box-Cox and log transforms -- confirm that Fare contains zeros.
# This overwrites data$Fare in place, so re-running the chunk shifts it again.
data$Fare <- data$Fare + 0.01

# Profile the Box-Cox log-likelihood for Fare under an intercept-only model
Box <- boxcox(
  data$Fare ~ 1,              # Transform Fare as a single vector
  lambda = seq(-10, 10, 0.1)  # Try values -10 to 10 by 0.1
)

# Collect the (lambda, log-likelihood) pairs and sort them so that the
# best-fitting lambda comes first
Cox <- data.frame(Box$x, Box$y)
Cox2 <- Cox[order(-Cox$Box.y), ]

# Display the lambda with the greatest log-likelihood
Cox2[1, ]
## Box.x Box.y
## 103 0.2 -3174.219

# Extract that lambda
lambda <- Cox2[1, "Box.x"]

# Apply the Box-Cox transform to the original data. The power formula
# divides by lambda, so lambda == 0 (which is on the search grid) must use
# its limiting form, log(x).
T_box <- if (abs(lambda) < 1e-8) {
  log(data$Fare)
} else {
  (data$Fare^lambda - 1) / lambda
}
print(lambda)
## [1] 0.2
# Alternative transforms of the Fare column, kept as separate vectors

# Square root
Fare_srt <- sqrt(data[["Fare"]])

# Signed cube root: take the root of the magnitude and restore the sign,
# which keeps the result real for negative inputs
Fare_cub <- abs(data[["Fare"]])^(1 / 3) * sign(data[["Fare"]])

# Natural logarithm
Fare_log <- log(data[["Fare"]])

# Transform chosen by rcompanion's transformTukey(), with plotting disabled
library(rcompanion)
Fare_tuk <- transformTukey(data[["Fare"]], plotit = FALSE)
##
## lambda W Shapiro.p.value
## 410 0.225 0.8814 1.597e-25
##
## if (lambda > 0){TRANS = x ^ lambda}
## if (lambda == 0){TRANS = log(x)}
## if (lambda < 0){TRANS = -1 * x ^ lambda}
library(MASS)