#install libraries
#install.packages("Hmisc")
library("Hmisc")
## Warning: package 'Hmisc' was built under R version 3.6.2
## Loading required package: lattice
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.6.2
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.1 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.3 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ✓ purrr 0.3.4
## Warning: package 'tibble' was built under R version 3.6.2
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'purrr' was built under R version 3.6.2
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::src() masks Hmisc::src()
## x dplyr::summarize() masks Hmisc::summarize()
library(tigerstats)
## Warning: package 'tigerstats' was built under R version 3.6.2
## Loading required package: abd
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## Loading required package: grid
## Loading required package: mosaic
## Loading required package: ggformula
## Loading required package: ggstance
##
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
##
## geom_errorbarh, GeomErrorbarh
##
## New to ggformula? Try the tutorials:
## learnr::run_tutorial("introduction", package = "ggformula")
## learnr::run_tutorial("refining", package = "ggformula")
## Loading required package: mosaicData
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Note: If you use the Matrix package, be sure to load it BEFORE loading mosaic.
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
##
## mean
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## stat
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median,
## prop.test, quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
## Welcome to tigerstats!
## To learn more about this package, consult its website:
## http://homerhanumat.github.io/tigerstats
data <- read.csv("/Users/infectiousdiseases/Downloads/crewShipData.csv", header = T)
# 1 : Read the file and display columns.
#View(data)
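# To display the columns and their types without opening the viewer (a minimal sketch;
# str() and head() are base R and need nothing beyond the data frame read in above):
str(data)   # column names, classes, and a preview of each variable
head(data)  # first six rows of the data set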
#2 : Calculate basic statistics of the data (count, mean, std, etc) and examine data and state your observations.
summary(data) # provides the minimum, 1st quartile, median, mean, 3rd quartile, and maximum for each variable/column
## Ship_name Cruise_line Age Tonnage
## Spirit : 4 Royal_Caribbean :23 Min. : 4.00 Min. : 2.329
## Legend : 3 Carnival :22 1st Qu.:10.00 1st Qu.: 46.013
## Star : 3 Princess :17 Median :14.00 Median : 71.899
## Crown : 2 Holland_American:14 Mean :15.69 Mean : 71.285
## Dawn : 2 Norwegian :13 3rd Qu.:20.00 3rd Qu.: 90.772
## Freedom: 2 Costa :11 Max. :48.00 Max. :220.000
## (Other):142 (Other) :58
## passengers length cabins passenger_density
## Min. : 0.66 Min. : 2.790 Min. : 0.330 Min. :17.70
## 1st Qu.:12.54 1st Qu.: 7.100 1st Qu.: 6.133 1st Qu.:34.57
## Median :19.50 Median : 8.555 Median : 9.570 Median :39.09
## Mean :18.46 Mean : 8.131 Mean : 8.830 Mean :39.90
## 3rd Qu.:24.84 3rd Qu.: 9.510 3rd Qu.:10.885 3rd Qu.:44.19
## Max. :54.00 Max. :11.820 Max. :27.000 Max. :71.43
##
## crew
## Min. : 0.590
## 1st Qu.: 5.480
## Median : 8.150
## Mean : 7.794
## 3rd Qu.: 9.990
## Max. :21.000
##
#Basic statistics can also be collected for each variable using favstats(), which gives the min, Q1, median, Q3, max, mean, sd, number of observations, and number of missing values.
favstats(data$Ship_name)
## Warning in fav_stats(x, ..., na.rm = na.rm): Auto-converting factor to
## numeric.
## min Q1 median Q3 max mean sd n missing
## 1 37.25 71.5 106.75 138 71.1962 39.60674 158 0
# Ship_name is a factor; therefore the min, IQR, and sd should not be interpreted. What you can observe is that there are 138 unique ship names.
favstats(data$Cruise_line) # Cruise_line is a factor; therefore the min, IQR, and sd should not be interpreted. What you can observe is that there are 20 unique cruise lines.
## Warning in fav_stats(x, ..., na.rm = na.rm): Auto-converting factor to
## numeric.
## min Q1 median Q3 max mean sd n missing
## 1 4 10 15 20 9.981013 5.705592 158 0
favstats(data$Age)
## min Q1 median Q3 max mean sd n missing
## 4 10 14 20 48 15.68987 7.615691 158 0
favstats(data$Tonnage)
## min Q1 median Q3 max mean sd n missing
## 2.329 46.013 71.899 90.7725 220 71.28467 37.22954 158 0
favstats(data$passengers)
## min Q1 median Q3 max mean sd n missing
## 0.66 12.535 19.5 24.845 54 18.45741 9.677095 158 0
favstats(data$length)
## min Q1 median Q3 max mean sd n missing
## 2.79 7.1 8.555 9.51 11.82 8.130633 1.793474 158 0
favstats(data$cabins)
## min Q1 median Q3 max mean sd n missing
## 0.33 6.1325 9.57 10.885 27 8.83 4.471417 158 0
favstats(data$passenger_density)
## min Q1 median Q3 max mean sd n missing
## 17.7 34.57 39.085 44.185 71.43 39.90095 8.639217 158 0
favstats(data$crew)
## min Q1 median Q3 max mean sd n missing
## 0.59 5.48 8.15 9.99 21 7.794177 3.503487 158 0
hist(data$Age)
hist(data$Tonnage)
# One observation from this data set is that the nine variables are on very different scales.
# For example, Age ranges from 4 to 48, while Tonnage ranges from 2.329 tons to 220 tons.
# This can also be seen in the distribution curves of the two variables (Age and Tonnage), where Age is more skewed to the right.
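# Because the variables sit on very different scales, one option (not required here, but
# often useful before a distance-based model such as kNN) is to standardize the numeric
# columns. A minimal sketch using base R's scale():
numeric_cols <- sapply(data, is.numeric)
data_scaled <- as.data.frame(scale(data[, numeric_cols]))  # each column rescaled to mean 0, sd 1
summary(data_scaled)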
# 3 : Select columns that will probably be important to predict “crew” size.
#R base scatter plot matrices: pairs()
pairs(data[,1:9], pch = 19)
# With the use of a base scatter plot matrix, we are able to observe the variables that would be important predictors of “crew” size. In this case the four important predictors are Tonnage, passengers, length, and cabins. According to this diagram, these variables are correlated with the variable crew.
#Creating a correlation matrix is another way to check the correlation values for each pair of variables.
new_data <- subset(data, select = -c(1,2)) # must remove Ship_name and Cruise_line because they are factors, not numeric, so correlation values cannot be computed for them.
#I removed columns one and two (Ship_name and Cruise_line) because they are not numerical and therefore cannot be included in the correlation matrix.
cor(new_data)
## Age Tonnage passengers length cabins
## Age 1.0000000 -0.60664609 -0.5155423 -0.53228589 -0.5100190
## Tonnage -0.6066461 1.00000000 0.9450614 0.92236832 0.9487636
## passengers -0.5155423 0.94506140 1.0000000 0.88353479 0.9763414
## length -0.5322859 0.92236832 0.8835348 1.00000000 0.8897982
## cabins -0.5100190 0.94876357 0.9763414 0.88979821 1.0000000
## passenger_density -0.2788302 -0.04084624 -0.2948671 -0.09048847 -0.2531807
## crew -0.5306565 0.92756881 0.9152341 0.89585663 0.9508226
## passenger_density crew
## Age -0.27883020 -0.5306565
## Tonnage -0.04084624 0.9275688
## passengers -0.29486708 0.9152341
## length -0.09048847 0.8958566
## cabins -0.25318074 0.9508226
## passenger_density 1.00000000 -0.1555093
## crew -0.15550928 1.0000000
You will notice, based on the correlation matrix, that there are four important predictors of “crew” size: Tonnage, passengers, length, and cabins. I selected these variables because, in the “crew” column of the matrix, these four variables (excluding crew itself) have strong correlations with “crew”, ranging from .89 to .95.
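To make these relationships easier to read, the correlations with “crew” can also be pulled out of the matrix and sorted directly (a minimal sketch using the new_data object created above):
crew_cor <- sort(cor(new_data)[, "crew"], decreasing = TRUE)
round(crew_cor, 3)  # cabins, Tonnage, passengers, and length show the strongest correlations with crew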
data$Ship_name <- NULL
data$passenger_density <- NULL
data$length <- NULL
I am removing Ship_name, passenger_density, and length from my dataset.
- Reason for removing Ship_name: there are 138 unique ship names, which would lead to a very large dimension for my one-hot encoding.
- Reason for removing passenger_density and length: passenger_density has a weak correlation with my outcome “crew” (about -0.16), and length, while still correlated, is the weakest of the four strong predictors.
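After dropping these columns, the remaining variables can be confirmed quickly (a minimal sketch using base R):
names(data)  # expected: Cruise_line, Age, Tonnage, passengers, cabins, crew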
#install packages and load libraries
#install.packages("onehot")
#install.packages("mltools")
#library(dplyr)
#library(stringr)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(mltools)
##
## Attaching package: 'mltools'
## The following object is masked from 'package:tidyr':
##
## replace_na
library(onehot)
#one-hot encoding for categorical features
data_onehot <- one_hot(as.data.table(data))
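# Sanity check on the encoding (a minimal sketch): the 20 Cruise_line levels should now
# appear as separate 0/1 indicator columns alongside the numeric variables.
dim(data_onehot)
names(data_onehot)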
#6. Create training and testing sets (use 60% of the data for the training and remainder for testing).
#set seed for reproducibility
set.seed(1701)
# splitting my data: 60% for training and 40% for testing.
nrow(data_onehot)*.60
## [1] 94.8
data <- cbind(data, data_onehot) # append the one-hot encoded columns; the original columns keep positions 1-6
# training index: 95 observations, i.e. 60% of the 158 rows (94.8) rounded up
train_index <- sample(1:nrow(data_onehot), 95, replace = F)
#Training data: use 60%
train <- data[train_index, ]
#Testing data: the remaining 40% of the partition goes here
test <- data[-train_index,]
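# Quick check of the split sizes (a minimal sketch): the 158 rows should split into
# roughly 60% training and 40% testing.
nrow(train)  # expected 95
nrow(test)   # expected 63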
# 7 Build a machine learning model to predict the ‘crew’ size.
#install.packages("caret")
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:mosaic':
##
## dotPlot
## The following object is masked from 'package:purrr':
##
## lift
## The following object is masked from 'package:survival':
##
## cluster
#k-Nearest Neighbors
#model for training
model_knn_train <- train(train[, 2:5], train[, 6], method='knn')
#model fitted on the testing set
model_knn_test <- train(test[, 2:5], test[, 6], method='knn')
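# As an additional check (a minimal sketch, assuming the model_knn_train object above),
# the training-set model can be used to predict crew size on the held-out test rows and
# compared against the observed values with caret::postResample:
knn_pred <- predict(model_knn_train, newdata = test[, 2:5])
postResample(pred = knn_pred, obs = test[, 6])  # RMSE, R-squared, and MAE on the test set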