#install libraries
#install.packages("Hmisc")
library("Hmisc")
## Warning: package 'Hmisc' was built under R version 3.6.2
## Loading required package: lattice
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.6.2
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.0.1     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.3     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ✓ purrr   0.3.4
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter()    masks stats::filter()
## x dplyr::lag()       masks stats::lag()
## x dplyr::src()       masks Hmisc::src()
## x dplyr::summarize() masks Hmisc::summarize()
library(tigerstats)
## Warning: package 'tigerstats' was built under R version 3.6.2
## Loading required package: abd
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## Loading required package: grid
## Loading required package: mosaic
## Loading required package: ggformula
## Loading required package: ggstance
## 
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
## 
##     geom_errorbarh, GeomErrorbarh
## 
## New to ggformula?  Try the tutorials: 
##  learnr::run_tutorial("introduction", package = "ggformula")
##  learnr::run_tutorial("refining", package = "ggformula")
## Loading required package: mosaicData
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
## 
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following object is masked from 'package:purrr':
## 
##     cross
## The following object is masked from 'package:ggplot2':
## 
##     stat
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median,
##     prop.test, quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
## Welcome to tigerstats!
## To learn more about this package, consult its website:
##  http://homerhanumat.github.io/tigerstats
# Load the cruise-ship data set.
# stringsAsFactors = TRUE is spelled out because read.csv() stopped converting
# strings to factors by default in R 4.0.0; the summary below and the one-hot
# encoding step later in this script treat Ship_name and Cruise_line as factors.
data <- read.csv(
  "/Users/infectiousdiseases/Downloads/crewShipData.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
# 1 : Read the file and display columns.
#View(data)
# 2 : Calculate basic statistics of the data (count, mean, std, etc) and examine data and state your observations.

summary(data) # provides the minimum, quartiles, median, mean, and maximum for each numeric column (and level counts for factor columns)
##    Ship_name             Cruise_line      Age           Tonnage       
##  Spirit :  4   Royal_Caribbean :23   Min.   : 4.00   Min.   :  2.329  
##  Legend :  3   Carnival        :22   1st Qu.:10.00   1st Qu.: 46.013  
##  Star   :  3   Princess        :17   Median :14.00   Median : 71.899  
##  Crown  :  2   Holland_American:14   Mean   :15.69   Mean   : 71.285  
##  Dawn   :  2   Norwegian       :13   3rd Qu.:20.00   3rd Qu.: 90.772  
##  Freedom:  2   Costa           :11   Max.   :48.00   Max.   :220.000  
##  (Other):142   (Other)         :58                                    
##    passengers        length           cabins       passenger_density
##  Min.   : 0.66   Min.   : 2.790   Min.   : 0.330   Min.   :17.70    
##  1st Qu.:12.54   1st Qu.: 7.100   1st Qu.: 6.133   1st Qu.:34.57    
##  Median :19.50   Median : 8.555   Median : 9.570   Median :39.09    
##  Mean   :18.46   Mean   : 8.131   Mean   : 8.830   Mean   :39.90    
##  3rd Qu.:24.84   3rd Qu.: 9.510   3rd Qu.:10.885   3rd Qu.:44.19    
##  Max.   :54.00   Max.   :11.820   Max.   :27.000   Max.   :71.43    
##                                                                     
##       crew       
##  Min.   : 0.590  
##  1st Qu.: 5.480  
##  Median : 8.150  
##  Mean   : 7.794  
##  3rd Qu.: 9.990  
##  Max.   :21.000  
## 
# Basic statistics can also be computed for each variable with favstats(), which reports the min, Q1, median, Q3, max, mean, sd, number of observations, and number of missing values.
favstats(data$Ship_name) 
## Warning in fav_stats(x, ..., na.rm = na.rm): Auto-converting factor to
## numeric.
##  min    Q1 median     Q3 max    mean       sd   n missing
##    1 37.25   71.5 106.75 138 71.1962 39.60674 158       0
# This is a factor (auto-converted to its numeric codes, see the warning), so the min, quartiles, mean, and sd should not be interpreted. What you can observe is that there are 138 unique ship names (the max code).
favstats(data$Cruise_line) # This is a factor; therefore the min, quartiles, mean, and sd should not be interpreted. What you can observe is that there are 20 unique cruise lines.
## Warning in fav_stats(x, ..., na.rm = na.rm): Auto-converting factor to
## numeric.
##  min Q1 median Q3 max     mean       sd   n missing
##    1  4     10 15  20 9.981013 5.705592 158       0
favstats(data$Age) 
##  min Q1 median Q3 max     mean       sd   n missing
##    4 10     14 20  48 15.68987 7.615691 158       0
favstats(data$Tonnage) 
##    min     Q1 median      Q3 max     mean       sd   n missing
##  2.329 46.013 71.899 90.7725 220 71.28467 37.22954 158       0
favstats(data$passengers) 
##   min     Q1 median     Q3 max     mean       sd   n missing
##  0.66 12.535   19.5 24.845  54 18.45741 9.677095 158       0
favstats(data$length)
##   min  Q1 median   Q3   max     mean       sd   n missing
##  2.79 7.1  8.555 9.51 11.82 8.130633 1.793474 158       0
favstats(data$cabins)
##   min     Q1 median     Q3 max mean       sd   n missing
##  0.33 6.1325   9.57 10.885  27 8.83 4.471417 158       0
favstats(data$passenger_density)
##   min    Q1 median     Q3   max     mean       sd   n missing
##  17.7 34.57 39.085 44.185 71.43 39.90095 8.639217 158       0
favstats(data$crew)
##   min   Q1 median   Q3 max     mean       sd   n missing
##  0.59 5.48   8.15 9.99  21 7.794177 3.503487 158       0
# Histograms of two of the numeric variables, to compare their distributions.
hist(data$Age)

hist(data$Tonnage)

## An observation that can be made from this data set is that the nine variables are on different scales. For example, Age ranges from 4 to 48, while Tonnage ranges from 2.329 tons to 220 tons. This can also be seen in the distribution curves (histograms) of the two variables (Age and Tonnage), where Age is more skewed to the right.

# 3 : Select columns that will probably be important to predict "crew" size.
# Base R scatter-plot matrix (pairs) over the first nine columns.
pairs(x = data[, seq_len(9)], pch = 19)

# The scatter-plot matrix shows which variables are likely to be important predictors of "crew" size. Here the four important predictors are Tonnage, passengers, length, and cabins; in the plot each of them is visibly correlated with crew.

# A correlation matrix is an additional way to check the correlation for each pair of variables.
# Columns one and two (Ship_name and Cruise_line) are dropped first: they are factors, not numeric, so correlation values cannot be computed for them.
new_data <- data[, -(1:2), drop = FALSE]

cor(new_data)
##                          Age     Tonnage passengers      length     cabins
## Age                1.0000000 -0.60664609 -0.5155423 -0.53228589 -0.5100190
## Tonnage           -0.6066461  1.00000000  0.9450614  0.92236832  0.9487636
## passengers        -0.5155423  0.94506140  1.0000000  0.88353479  0.9763414
## length            -0.5322859  0.92236832  0.8835348  1.00000000  0.8897982
## cabins            -0.5100190  0.94876357  0.9763414  0.88979821  1.0000000
## passenger_density -0.2788302 -0.04084624 -0.2948671 -0.09048847 -0.2531807
## crew              -0.5306565  0.92756881  0.9152341  0.89585663  0.9508226
##                   passenger_density       crew
## Age                     -0.27883020 -0.5306565
## Tonnage                 -0.04084624  0.9275688
## passengers              -0.29486708  0.9152341
## length                  -0.09048847  0.8958566
## cabins                  -0.25318074  0.9508226
## passenger_density        1.00000000 -0.1555093
## crew                    -0.15550928  1.0000000

3 Answer:

Based on the correlation matrix, there are four important predictors of “crew” size: Tonnage, passengers, length, and cabins. I selected these variables because, in the “crew” column, they are the four variables (excluding crew itself) that are strongly correlated with “crew”. Their correlations with crew range from roughly 0.90 to 0.95, which are very strong correlations.

4 : If you removed columns explain why you removed those.

# Remove Ship_name, passenger_density, and length from the data set in a single
# step, keeping every other column in its original order.
data <- data[
  ,
  !(names(data) %in% c("Ship_name", "passenger_density", "length")),
  drop = FALSE
]

I am removing Ship_name, passenger_density, and length from my dataset. - Ship_name is removed because there are 138 unique ship names, which would lead to a very high-dimensional one-hot encoding. - passenger_density is removed because it is only weakly correlated with my outcome “crew” (r ≈ -0.16). - length is also removed; note, however, that length is strongly correlated with “crew” (r ≈ 0.90), so weak correlation does not justify dropping it. It is, though, highly correlated with the retained predictor Tonnage (r ≈ 0.92).

5 Use one-hot encoding for categorical features.

#install packages and load libraries #install.packages("onehot")

#install.packages("mltools")

#library(dplyr) #library(stringr)

Ship_name has too many dimensions to one-hot encode: it has 138 levels.

library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(mltools)
## 
## Attaching package: 'mltools'
## The following object is masked from 'package:tidyr':
## 
##     replace_na
library(onehot)
# Convert the data frame to a data.table and one-hot encode its categorical
# (factor) features.
data_onehot <- data %>%
  as.data.table() %>%
  one_hot()
#6. Create training and testing sets (use 60% of the data for the training and remainder for testing).
# Set the seed so the random split is reproducible.
set.seed(1701)
# Split the data: 60% for training and 40% for testing.
nrow(data_onehot) * 0.60
## [1] 94.8
# Training-set size: 60% of the rows, rounded up (95 for the 158 rows here).
# Derived from the data instead of hard-coded so it follows the row count.
n_train <- ceiling(nrow(data_onehot) * 0.60)
# NOTE(review): this appends the one-hot table to the right of the existing
# columns, so the numeric columns presumably appear twice in `data`; the
# positional indexing used when the models are fitted (columns 2:5 and 6) only
# reaches the original, left-hand copies -- confirm this is intended.
data <- cbind(data, data_onehot)
# Row indices of the training partition, drawn without replacement.
train_index <- sample(seq_len(nrow(data_onehot)), n_train, replace = FALSE)
# Training data: the sampled 60% of the rows.
train <- data[train_index, ]
# Testing data: the remaining rows of the partition.
test <- data[-train_index, ]
# 7 Build a machine learning model to predict the ‘crew’ size.
    
#install.packages("caret")
library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:mosaic':
## 
##     dotPlot
## The following object is masked from 'package:purrr':
## 
##     lift
## The following object is masked from 'package:survival':
## 
##     cluster
# k-Nearest Neighbors
# NOTE: the data frame `train` shares its name with caret's train() function.
# The calls below still reach the function, because R looks for a function
# object when a name is used in call position.

# Model fitted on the training partition: columns 2:5 are the predictors and
# column 6 is the outcome (expected to be crew, given the earlier column
# removals -- confirm).
model_knn_train <- train(train[, 2:5], train[, 6], method = "knn")

# Evaluate the training-partition model on the held-out test partition and
# print its performance metrics.
knn_test_pred <- predict(model_knn_train, newdata = test[, 2:5])
postResample(pred = knn_test_pred, obs = test[, 6])

# Model fitted on the test partition (kept so later code that uses
# model_knn_test keeps working). NOTE: this is a second, separately trained
# model; it does not measure how model_knn_train performs on unseen data.
model_knn_test <- train(test[, 2:5], test[, 6], method = "knn")