Link to the project in RPubs: https://rpubs.com/ofomicheva86/390683

#required packages
library(MASS)
library(PerformanceAnalytics)

1. DATA EXPLORATION

The dataset contains 12 columns: the identifier columns ‘PassengerId’ and ‘Name’, plus the variables described below:

  1. ‘Survived’- survival (0 = No, 1 = Yes)
  2. ‘Pclass’ - ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
  3. ‘Sex’ - sex
  4. ‘Age’ - age in years
  5. ‘SibSp’ - number of siblings / spouses aboard the Titanic
  6. ‘Parch’ - number of parents / children aboard the Titanic
  7. ‘Ticket’ - ticket number
  8. ‘Fare’ - passenger fare
  9. ‘Cabin’ - cabin number
  10. ‘Embarked’ - port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

Read training and testing datasets

#read training data set
#empty fields and the literal string "NA" are both read as missing values (NA)
#header is spelled TRUE because T is an ordinary variable that can be reassigned
data <- read.csv(
  file = "https://raw.githubusercontent.com/olga0503/DATA-621/master/titanic_train.csv",
  header = TRUE, na.strings = c("", "NA"))

#read testing data set with the same parsing options
data_testing <- read.csv(
  file = "https://raw.githubusercontent.com/olga0503/DATA-621/master/titanic_test.csv",
  header = TRUE, na.strings = c("", "NA"))

#display first six entries of the training data (head() default)
head(data)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp
## 1                             Braund, Mr. Owen Harris   male  22     1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1
## 3                              Heikkinen, Miss. Laina female  26     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1
## 5                            Allen, Mr. William Henry   male  35     0
## 6                                    Moran, Mr. James   male  NA     0
##   Parch           Ticket    Fare Cabin Embarked
## 1     0        A/5 21171  7.2500  <NA>        S
## 2     0         PC 17599 71.2833   C85        C
## 3     0 STON/O2. 3101282  7.9250  <NA>        S
## 4     0           113803 53.1000  C123        S
## 5     0           373450  8.0500  <NA>        S
## 6     0           330877  8.4583  <NA>        Q
#find dimensions (number of rows, number of columns)
dim(data)
## [1] 891  12
#list the distinct values of Parch by converting the column to a factor
levels(factor(data$Parch))
## [1] "0" "1" "2" "3" "4" "5" "6"
#required packages
library(MASS)
library(PerformanceAnalytics)
# Shift every fare by a small constant before the Box-Cox fit below.
# NOTE(review): presumably needed because boxcox() requires a strictly
# positive response and some fares may be 0 -- confirm against the data.
# This overwrites data$Fare in place, so all later code sees the shifted values.
data$Fare <- data$Fare + 0.01

# Profile the Box-Cox log-likelihood for Fare, treated as a single vector
# (intercept-only model), over a grid of candidate lambdas.
Box <- boxcox(data$Fare ~ 1,
              lambda = seq(-10, 10, 0.1))  # Try values -10 to 10 by 0.1

Cox <- data.frame(Box$x, Box$y)            # Create a data frame with the results
Cox2 <- Cox[order(-Cox$Box.y), ]           # Order the new data frame by decreasing y

Cox2[1, ]                                  # Display the lambda with the greatest log likelihood
##     Box.x     Box.y
## 103   0.2 -3174.219
lambda <- Cox2[1, "Box.x"]                 # Extract that lambda

# Transform the original data with the selected lambda.
# The Box-Cox transform is (x^lambda - 1) / lambda, whose limit as lambda -> 0
# is log(x); the grid above contains 0, so handle that case explicitly instead
# of dividing by zero (which would fill T_box with NaN).
T_box <- if (abs(lambda) < sqrt(.Machine$double.eps)) {
  log(data$Fare)
} else {
  (data$Fare^lambda - 1) / lambda
}

print(lambda)
## [1] 0.2
# Candidate transformations of Fare, each stored as its own vector.

# Square root
Fare_srt <- with(data, sqrt(Fare))

# Cube root: take the root of the magnitude and restore the sign, so that
# negative inputs would not produce NaN / complex results
Fare_cub <- with(data, sign(Fare) * abs(Fare)^(1/3))

# Natural logarithm
Fare_log <- with(data, log(Fare))

# Tukey ladder-of-powers transformation from rcompanion; plotting is disabled,
# the chosen lambda and the Shapiro-Wilk statistics are printed below
library(rcompanion)
Fare_tuk <- transformTukey(data$Fare, plotit = FALSE)
## 
##     lambda      W Shapiro.p.value
## 410  0.225 0.8814       1.597e-25
## 
## if (lambda >  0){TRANS = x ^ lambda} 
## if (lambda == 0){TRANS = log(x)} 
## if (lambda <  0){TRANS = -1 * x ^ lambda}
library(MASS)