# This is the R chunk for the required packages
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
The relevant packages were run in R markdown so that the datasets downloaded from Kaggle were able to be uploaded into R using read_csv. The datasets were then renamed to ease the codes and joined per the assignment task using left_join. Once completed we looked at the structure of our join and proceeded to factor a variable. The next step was to tidy the dataset that didn’t conform to the tidy data principles. Once the dataset was tidy a new column was created. The next step was to scan for missing values and decide which action to proceed with. Once the missing values had been sorted the next step was to look for an outliers in the data and make a decision on which method is best to handle the dataset outliers. The final step was to transform a variable. The variable chosen was highly left skewed so the appropriate steps were taken to decrease the skewness and convert the distribution into a normal distribution.
The datasets supplied for this assignment were taken from Kaggle and can be found at https://www.kaggle.com/aturate/titanic-dataset. One datasets names were flight_data and planes. Flight data had 19 variables including all necessary data fields needed for a flight to be scheduled. The planes dataset had 9 variables that explained the mechanics of a plane. The steps taken to action this data were; * Both sets were imported into R using read_csv and named flight_data and planes respectively. * The column names were shown by using the head() function. It was in this step that talinum was found to be common in both datasets. * Using talinum the datasets were joined to form a combined dataset named “a”. * To make sure the join was successful and all columns merged head() was used on the combined dataset a.
# This is the R chunk for the Data Section
planes <- read_csv("planes.csv")
## Parsed with column specification:
## cols(
## tailnum = col_character(),
## year_manu = col_double(),
## type = col_character(),
## manufacturer = col_character(),
## model = col_character(),
## engines = col_double(),
## seats = col_double(),
## speed = col_double(),
## engine = col_character()
## )
flight_data <- read_csv("flight_data.csv")
## Parsed with column specification:
## cols(
## year = col_double(),
## month = col_double(),
## day = col_double(),
## dep_time = col_double(),
## sched_dep_time = col_double(),
## dep_delay = col_double(),
## arr_time = col_double(),
## sched_arr_time = col_double(),
## arr_delay = col_double(),
## carrier = col_character(),
## flight = col_double(),
## tailnum = col_character(),
## origin = col_character(),
## dest = col_character(),
## air_time = col_double(),
## distance = col_double(),
## hour = col_double(),
## minute = col_double(),
## time_hour = col_character()
## )
head(planes)
head(flight_data)
a <- left_join(planes,flight_data, by="tailnum")
head(a)
The types of variables were discovered by using str() on the dataset. Numerical and character variables made up this dataset. The engine variable was able to be factored using factor() and then factored and ordered into five (5) levels.
# This is the R chunk for the Understand Section
str(a)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 284170 obs. of 27 variables:
## $ tailnum : chr "N10156" "N10156" "N10156" "N10156" ...
## $ year_manu : num 2004 2004 2004 2004 2004 ...
## $ type : chr "Fixed wing multi engine" "Fixed wing multi engine" "Fixed wing multi engine" "Fixed wing multi engine" ...
## $ manufacturer : chr "EMBRAER" "EMBRAER" "EMBRAER" "EMBRAER" ...
## $ model : chr "EMB-145XR" "EMB-145XR" "EMB-145XR" "EMB-145XR" ...
## $ engines : num 2 2 2 2 2 2 2 2 2 2 ...
## $ seats : num 55 55 55 55 55 55 55 55 55 55 ...
## $ speed : num NA NA NA NA NA NA NA NA NA NA ...
## $ engine : chr "Turbo-fan" "Turbo-fan" "Turbo-fan" "Turbo-fan" ...
## $ year : num 2013 2013 2013 2013 2013 ...
## $ month : num 1 1 1 1 1 1 1 1 1 1 ...
## $ day : num 10 10 10 11 11 11 12 12 13 13 ...
## $ dep_time : num 626 1120 1619 632 1116 ...
## $ sched_dep_time: num 630 1032 1540 634 1120 ...
## $ dep_delay : num -4 48 39 -2 -4 26 -8 11 15 102 ...
## $ arr_time : num 802 1320 1831 810 1328 ...
## $ sched_arr_time: num 800 1240 1744 822 1336 ...
## $ arr_delay : num 2 40 47 -12 -8 27 -20 -5 20 123 ...
## $ carrier : chr "EV" "EV" "EV" "EV" ...
## $ flight : num 4560 4269 4667 4334 4298 ...
## $ origin : chr "EWR" "EWR" "EWR" "EWR" ...
## $ dest : chr "PIT" "CHS" "MSP" "CMH" ...
## $ air_time : num 60 99 175 81 171 49 84 88 101 39 ...
## $ distance : num 319 628 1008 463 1092 ...
## $ hour : num 6 10 15 6 11 18 8 13 15 20 ...
## $ minute : num 30 32 40 34 20 19 38 59 36 39 ...
## $ time_hour : chr "10-01-2013 06:00" "10-01-2013 10:00" "10-01-2013 15:00" "11-01-2013 06:00" ...
## - attr(*, "spec")=
## .. cols(
## .. tailnum = col_character(),
## .. year_manu = col_double(),
## .. type = col_character(),
## .. manufacturer = col_character(),
## .. model = col_character(),
## .. engines = col_double(),
## .. seats = col_double(),
## .. speed = col_double(),
## .. engine = col_character()
## .. )
a$engine <- factor(a$engine)
a$engine <- factor(a$engine,
levels = c("Turbo-jet","Turbo-fan","Turbo-prop","Turbo-shaft","4 Cycle","Reciprocating"),
labels = c("Turbo-jet","Turbo-fan","Turbo-prop","Turbo-shaft","4 Cycle","Reciprocating"),
ordered = T)
The data is considered untidy data as hour and minute columns were separate for no specific reason. To fix the messy data the necessary steps were taken.
# This is the R chunk for the Tidy & Manipulate Data I
a$departure_time <- paste(a$hour,":",a$minute)
a$time_hour <- as.Date(a$time_hour)
a$time_hour <- as.Date(a$time_hour,"%d-%m-%y")
a$time_hour <- format(a$time_hour,"%d-%m-%y")
a$departure_date <- a$time_hour
c <- subset(a, select=-c(time_hour, year,month,day, hour, minute))
head(c)
Although the dataset had lots of data there was no dataset for the flight duration. This variable was created using the mutate() function and found by dep_time being taken away from arr_time and being divided by 60 to give the figure in hours.
# This is the R chunk for the Tidy & Manipulate Data II
c<-mutate(c, flight_duration = (arr_time - dep_time)/60)
# This is the R chunk for the Scan I
colSums(is.na(c))
## tailnum year_manu type manufacturer model
## 0 5306 0 0 0
## engines seats speed engine dep_time
## 0 0 283207 0 4199
## sched_dep_time dep_delay arr_time sched_arr_time arr_delay
## 0 4199 4551 0 5153
## carrier flight origin dest air_time
## 0 0 0 0 5153
## distance departure_time departure_date flight_duration
## 0 0 0 4551
sum(is.na(c))
## [1] 316319
c$speed<- impute(c$speed, fun = mean)
c$year_manu <- impute(c$year_manu, fun = mean)
c$dep_time <- impute(c$dep_time, fun = mean)
c$dep_delay <- impute(c$dep_delay, fun = mean)
c$arr_time <- impute(c$arr_time, fun = mean)
c$arr_delay <- impute(c$arr_delay, fun = mean)
c$air_time <- impute(c$air_time, fun = mean)
c$flight_duration <- impute(c$flight_duration, fun = mean)
# This is the R chunk for the Scan II
str(c)
## Classes 'tbl_df', 'tbl' and 'data.frame': 284170 obs. of 24 variables:
## $ tailnum : chr "N10156" "N10156" "N10156" "N10156" ...
## $ year_manu : 'impute' num 2004 2004 2004 2004 2004 ...
## ..- attr(*, "imputed")= int 26283 26284 26285 26286 26287 26288 26289 26290 26291 26292 ...
## $ type : chr "Fixed wing multi engine" "Fixed wing multi engine" "Fixed wing multi engine" "Fixed wing multi engine" ...
## $ manufacturer : chr "EMBRAER" "EMBRAER" "EMBRAER" "EMBRAER" ...
## $ model : chr "EMB-145XR" "EMB-145XR" "EMB-145XR" "EMB-145XR" ...
## $ engines : num 2 2 2 2 2 2 2 2 2 2 ...
## $ seats : num 55 55 55 55 55 55 55 55 55 55 ...
## $ speed : 'impute' num 151 151 151 151 151 ...
## ..- attr(*, "imputed")= int 1 2 3 4 5 6 7 8 9 10 ...
## $ engine : Ord.factor w/ 6 levels "Turbo-jet"<"Turbo-fan"<..: 2 2 2 2 2 2 2 2 2 2 ...
## $ dep_time : 'impute' num 626 1120 1619 632 1116 ...
## ..- attr(*, "imputed")= int 27 28 67 75 86 115 136 296 297 337 ...
## $ sched_dep_time : num 630 1032 1540 634 1120 ...
## $ dep_delay : 'impute' num -4 48 39 -2 -4 26 -8 11 15 102 ...
## ..- attr(*, "imputed")= int 27 28 67 75 86 115 136 296 297 337 ...
## $ arr_time : 'impute' num 802 1320 1831 810 1328 ...
## ..- attr(*, "imputed")= int 27 28 67 75 86 115 136 250 296 297 ...
## $ sched_arr_time : num 800 1240 1744 822 1336 ...
## $ arr_delay : 'impute' num 2 40 47 -12 -8 27 -20 -5 20 123 ...
## ..- attr(*, "imputed")= int 27 28 49 67 75 86 115 136 250 296 ...
## $ carrier : chr "EV" "EV" "EV" "EV" ...
## $ flight : num 4560 4269 4667 4334 4298 ...
## $ origin : chr "EWR" "EWR" "EWR" "EWR" ...
## $ dest : chr "PIT" "CHS" "MSP" "CMH" ...
## $ air_time : 'impute' num 60 99 175 81 171 49 84 88 101 39 ...
## ..- attr(*, "imputed")= int 27 28 49 67 75 86 115 136 250 296 ...
## $ distance : num 319 628 1008 463 1092 ...
## $ departure_time : chr "6 : 30" "10 : 32" "15 : 40" "6 : 34" ...
## $ departure_date : chr "20-01-10" "20-01-10" "20-01-10" "20-01-11" ...
## $ flight_duration: 'impute' num 2.93 3.33 3.53 2.97 3.53 ...
## ..- attr(*, "imputed")= int 27 28 67 75 86 115 136 250 296 297 ...
c$year_manu <- as.numeric(c$year_manu)
c$speed <- as.numeric(c$speed)
c$dep_time <- as.numeric(c$dep_time)
c$dep_delay <- as.numeric(c$dep_delay)
c$arr_time <- as.numeric(c$arr_time)
c$arr_delay <- as.numeric(c$arr_delay)
c$air_time <- as.numeric(c$air_time)
c$flight_duration <- as.numeric(c$flight_duration)
boxplot(c$year_manu)
boxplot(c$speed)
boxplot(c$dep_time)
boxplot(c$dep_delay)
boxplot(c$arr_time)
boxplot(c$arr_delay)
boxplot(c$air_time)
boxplot(c$flight_duration)
boxplot(c$engines)
boxplot(c$seats)
boxplot(c$sched_dep_time)
boxplot(c$sched_arr_time)
boxplot(c$flight)
boxplot(c$distance)
# This is the R chunk for the Transform Section
hist(c$air_time)
hist(log(c$air_time))