Cruise Project

John Watters

remove(list=ls())
library(rmarkdown)
library(fpp2)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## ── Attaching packages ────────────────────────────────────────────── fpp2 2.4 ──
## ✔ ggplot2   3.4.0     ✔ fma       2.4  
## ✔ forecast  8.19      ✔ expsmooth 2.3
## 
library(ggplot2)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ✔ purrr   0.3.5      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(rcompanion)
## 
## Attaching package: 'rcompanion'
## 
## The following object is masked from 'package:forecast':
## 
##     accuracy
library(lubridate)
## Loading required package: timechange
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
library(reshape)
## 
## Attaching package: 'reshape'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
## 
## The following object is masked from 'package:dplyr':
## 
##     rename
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, smiths
library(gains)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(lattice)
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(tibble)
library(TSstudio)

# Importing the global CO2 emissions data:

emissions <- read.csv("E:/Cruise/API_EN.ATM.CO2E.KT_DS2_en_csv_v2_3888754.csv")

# Transposing the data:

emissions <- t(emissions)
#View(emissions)

#Making first row the header with janitor package:

emissions <- janitor::row_to_names(emissions, 1)
#View(emissions)

#Extracting World data column:

globalemissions <- as.data.frame( emissions[,'World'], drop=false)
str(globalemissions)
## 'data.frame':    62 obs. of  1 variable:
##  $ emissions[, "World"]: chr  "9463838.500" "9423934.4240" "9732505.8900" "1.027450e+07" ...
#View(globalemissions)

#Renaming column with base R function:

names(globalemissions)[1] <- "World"
#View(globalemissions)
dim(globalemissions)
## [1] 62  1
str(globalemissions)
## 'data.frame':    62 obs. of  1 variable:
##  $ World: chr  "9463838.500" "9423934.4240" "9732505.8900" "1.027450e+07" ...
#Creating a date column from labels with tibble:

head(globalemissions)
##              World
## X1960  9463838.500
## X1961 9423934.4240
## X1962 9732505.8900
## X1963 1.027450e+07
## X1964 1.082025e+07
## X1965 1.140258e+07
globalemissions <- tibble::rownames_to_column(globalemissions, "Year")
head(globalemissions)
##    Year        World
## 1 X1960  9463838.500
## 2 X1961 9423934.4240
## 3 X1962 9732505.8900
## 4 X1963 1.027450e+07
## 5 X1964 1.082025e+07
## 6 X1965 1.140258e+07
#Removing the 'X' from Year column:

globalemissions$Year <- gsub("X", "", as.character(globalemissions$Year))
head(globalemissions)
##   Year        World
## 1 1960  9463838.500
## 2 1961 9423934.4240
## 3 1962 9732505.8900
## 4 1963 1.027450e+07
## 5 1964 1.082025e+07
## 6 1965 1.140258e+07
#Converting year into valid date with truncated dates with lubridate:

yrs <- c(1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970,
         1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981,
         1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 
         1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 
         2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014,
         2015, 2016, 2017, 2018, 2019, 2020, 2021)

lubridate::ymd(yrs, truncated = 2L)
##  [1] "1960-01-01" "1961-01-01" "1962-01-01" "1963-01-01" "1964-01-01"
##  [6] "1965-01-01" "1966-01-01" "1967-01-01" "1968-01-01" "1969-01-01"
## [11] "1970-01-01" "1971-01-01" "1972-01-01" "1973-01-01" "1974-01-01"
## [16] "1975-01-01" "1976-01-01" "1977-01-01" "1978-01-01" "1979-01-01"
## [21] "1980-01-01" "1981-01-01" "1982-01-01" "1983-01-01" "1984-01-01"
## [26] "1985-01-01" "1986-01-01" "1987-01-01" "1988-01-01" "1989-01-01"
## [31] "1990-01-01" "1991-01-01" "1992-01-01" "1993-01-01" "1994-01-01"
## [36] "1995-01-01" "1996-01-01" "1997-01-01" "1998-01-01" "1999-01-01"
## [41] "2000-01-01" "2001-01-01" "2002-01-01" "2003-01-01" "2004-01-01"
## [46] "2005-01-01" "2006-01-01" "2007-01-01" "2008-01-01" "2009-01-01"
## [51] "2010-01-01" "2011-01-01" "2012-01-01" "2013-01-01" "2014-01-01"
## [56] "2015-01-01" "2016-01-01" "2017-01-01" "2018-01-01" "2019-01-01"
## [61] "2020-01-01" "2021-01-01"
globalemissions$Year <- as.Date(as.character(yrs), format = "%Y")
head(globalemissions)
##         Year        World
## 1 1960-02-16  9463838.500
## 2 1961-02-16 9423934.4240
## 3 1962-02-16 9732505.8900
## 4 1963-02-16 1.027450e+07
## 5 1964-02-16 1.082025e+07
## 6 1965-02-16 1.140258e+07
#View(globalemissions)

#Creating a vector and time series out of the global emissions data:

emissions.ts <- matrix(globalemissions$World, nrow = 59, ncol = 1)
## Warning in matrix(globalemissions$World, nrow = 59, ncol = 1): data length [62]
## is not a sub-multiple or multiple of the number of rows [59]
emissions.ts <- as.vector(t(emissions.ts))
emissions.ts
##  [1] "9463838.500"   "9423934.4240"  "9732505.8900"  "1.027450e+07" 
##  [5] "1.082025e+07"  "1.140258e+07"  "1.198832e+07"  "1.241335e+07" 
##  [9] "13087346.5700" "1.397601e+07"  "1.527229e+07"  "15913710.780" 
## [13] "16680140.790"  "17582029.200"  "17563629.930"  "17641960.090" 
## [17] "18580873.040"  "19183226.420"  "19921141.710"  "20459807.600" 
## [21] "20396427.120"  "19879068.570"  "19815965.250"  "20034812.120" 
## [25] "20636636.760"  "21429383.160"  "21729229.480"  "22365639.630" 
## [29] "23289432.350"  "23699896.020"  "20607832.060"  "20799343.280" 
## [33] "20741660.86"   "20874105.03"   "20983787.41"   "21582843.03"  
## [37] "22039447.41"   "22394054.68"   "22509779.84"   "22615862.92"  
## [41] "23311805.66"   "23707262.47"   "24054448.1"    "25143943.9"   
## [45] "26319528.2"    "27325257.2"    "28223539.1"    "29312213.5"   
## [49] "29574920.8"    "29253757.5"    "31031799.0"    "32004027.9"   
## [53] "32444191.6"    "33053846.5"    "33085192.0"    "32943427.8"   
## [57] "32940650.1"    "33351608.1"    "34041046.0"
#Emissions by year are NOT a time series, as they are a list of data points indexed in time order.
#It should be relatively simple to display these as a line chart and analyze for trends.

emissions.ts <- ts(emissions.ts, start = c(1960), end = c(2018), frequency=1)
emissions.ts
## Time Series:
## Start = 1960 
## End = 2018 
## Frequency = 1 
##  [1]   9463838.500  9423934.4240  9732505.8900  1.027450e+07  1.082025e+07
##  [6]  1.140258e+07  1.198832e+07  1.241335e+07 13087346.5700  1.397601e+07
## [11]  1.527229e+07  15913710.780  16680140.790  17582029.200  17563629.930
## [16]  17641960.090  18580873.040  19183226.420  19921141.710  20459807.600
## [21]  20396427.120  19879068.570  19815965.250  20034812.120  20636636.760
## [26]  21429383.160  21729229.480  22365639.630  23289432.350  23699896.020
## [31]  20607832.060  20799343.280   20741660.86   20874105.03   20983787.41
## [36]   21582843.03   22039447.41   22394054.68   22509779.84   22615862.92
## [41]   23311805.66   23707262.47    24054448.1    25143943.9    26319528.2
## [46]    27325257.2    28223539.1    29312213.5    29574920.8    29253757.5
## [51]    31031799.0    32004027.9    32444191.6    33053846.5    33085192.0
## [56]    32943427.8    32940650.1    33351608.1    34041046.0
plot(ts(as.vector(emissions.ts)), type='o', xlab = 'Time (1960-2018)', ylab='Global CO2 Emissions (kt)')

mape <- function(actual,pred){
  mape <- mean(abs((actual - pred)/actual))*100
  return (mape)
}

#We can see a dip in emissions in 1990. This correlates to massive structural changes in 
#Western and European economic shifts away from manufacturing and towards service as well
#as a shift to lower coal consumption and higher gas consumption.

#Importing passenger count from 1990-2021:

passengers <- read.csv("E:/Cruise/PassengerCount.csv")
head(passengers)
##   Year Passengers.x1000.
## 1 1990              3774
## 2 1991              4168
## 3 1992              4385
## 4 1993              4800
## 5 1994              4970
## 6 1995              4721
#View(passengers)

#Transforming year into valid date column:

yr2 <- c(1990, 1991, 1992, 
         1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 
         2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014,
         2015, 2016, 2017, 2018, 2019, 2020, 2021)

lubridate::ymd(yr2, truncated = 2L)
##  [1] "1990-01-01" "1991-01-01" "1992-01-01" "1993-01-01" "1994-01-01"
##  [6] "1995-01-01" "1996-01-01" "1997-01-01" "1998-01-01" "1999-01-01"
## [11] "2000-01-01" "2001-01-01" "2002-01-01" "2003-01-01" "2004-01-01"
## [16] "2005-01-01" "2006-01-01" "2007-01-01" "2008-01-01" "2009-01-01"
## [21] "2010-01-01" "2011-01-01" "2012-01-01" "2013-01-01" "2014-01-01"
## [26] "2015-01-01" "2016-01-01" "2017-01-01" "2018-01-01" "2019-01-01"
## [31] "2020-01-01" "2021-01-01"
passengers$ï..Year <- as.Date(as.character(yr2), format = "%Y")
head(passengers)
##   Year Passengers.x1000.    ï..Year
## 1 1990              3774 1990-02-16
## 2 1991              4168 1991-02-16
## 3 1992              4385 1992-02-16
## 4 1993              4800 1993-02-16
## 5 1994              4970 1994-02-16
## 6 1995              4721 1995-02-16
#View(passengers)

#Renaming columns:

names(passengers)[1] <- 'Year'
head(passengers)
##   Year Passengers.x1000.    ï..Year
## 1 1990              3774 1990-02-16
## 2 1991              4168 1991-02-16
## 3 1992              4385 1992-02-16
## 4 1993              4800 1993-02-16
## 5 1994              4970 1994-02-16
## 6 1995              4721 1995-02-16
names(globalemissions)[2] <- 'WorldCO'

#Multiplying passenger count by average carbon emissions per cruise:
#Average passengers emit 0.82 tonnes of CO2 equiv per cruise. Converted to 
#KT is multiplied by .02.

passengers <- passengers %>% mutate(PassCO = ((Passengers.x1000.*1000)*.082)*.02)
#View(passengers)

#Plotting passenger emissions
plot
## function (x, y, ...) 
## UseMethod("plot")
## <bytecode: 0x000001ccfec9c3d8>
## <environment: namespace:base>
plot(passengers$Year, y = passengers$PassCO, type = "o", col = "red", xlab = "Year", ylab = "Passenger Emissions (kt)")

#Trimming year and adding dataframe columns for modeling:

globalemissions1990 <- globalemissions[-c(1:30),]
#View(globalemissions1990)

passco <- cbind(globalemissions1990, passengers)
passco <- passco[,-3]
passco <- passco[-c(30:32),]

#Plotting the two variables:

plot(passco$Year, y = passco$WorldCO, type = "o", col = "red", xlab = "Year", ylab = "World Emissions (kt)")

plot(passco$Year, y = passco$PassCO, type = "o", col = "red", xlab = "Year", ylab = "Passenger Emissions (Kt)")

#Subsetting and modeling for passenger emissions:

glimpse(passco)
## Rows: 29
## Columns: 5
## $ Year              <date> 1990-02-16, 1991-02-16, 1992-02-16, 1993-02-16, 199…
## $ WorldCO           <chr> "20607832.060", "20799343.280", "20741660.86", "2087…
## $ Passengers.x1000. <int> 3774, 4168, 4385, 4800, 4970, 4721, 5380, 5868, 5995…
## $ ï..Year           <date> 1990-02-16, 1991-02-16, 1992-02-16, 1993-02-16, 199…
## $ PassCO            <dbl> 6189.36, 6835.52, 7191.40, 7872.00, 8150.80, 7742.44…
n = nrow(passco)
split = sample(c(TRUE, FALSE), n, replace = TRUE, prob=c(0.75, 0.25))

dat_train = passco[split, ]
dat_test = passco[!split, ]
 
nrow(dat_train); nrow(dat_test)
## [1] 23
## [1] 6
#Creating a vector and time series out of the recent passenger emissions data:

dat_ts <- ts(dat_train[, 4], start = c(1990), end = c(2018), frequency = 1)
 
#MAPE function:
mape <- function(actual,pred){
  mape <- mean(abs((actual - pred)/actual))*100
  return (mape)
}

#Creating alternate TS objects for global and passenger emissions:

emissions.ts2 <- matrix(passco$WorldCO, nrow = 29, ncol = 1)
emissions.ts2 <- as.vector(t(emissions.ts2))
emissions.ts2
##  [1] "20607832.060" "20799343.280" "20741660.86"  "20874105.03"  "20983787.41" 
##  [6] "21582843.03"  "22039447.41"  "22394054.68"  "22509779.84"  "22615862.92" 
## [11] "23311805.66"  "23707262.47"  "24054448.1"   "25143943.9"   "26319528.2"  
## [16] "27325257.2"   "28223539.1"   "29312213.5"   "29574920.8"   "29253757.5"  
## [21] "31031799.0"   "32004027.9"   "32444191.6"   "33053846.5"   "33085192.0"  
## [26] "32943427.8"   "32940650.1"   "33351608.1"   "34041046.0"
emissions.ts3 <- matrix(passco$PassCO, nrow = 29, ncol = 1)
emissions.ts3 <- as.vector(t(emissions.ts3))
emissions.ts3
##  [1]  6189.36  6835.52  7191.40  7872.00  8150.80  7742.44  8823.20  9623.52
##  [9]  9831.80  9856.40 10392.68 11830.96 12298.36 14177.80 15622.64 17154.40
## [17] 18335.20 19689.84 23985.00 25877.56 28234.24 30210.44 31778.28 33349.40
## [25] 34400.64 36995.12 39651.92 41291.92 43466.56
#Emissions by year are NOT a time series, as they are a list of data points indexed in time order.
#It should be relatively simple to display these as a line chart and analyze for trends.

emissions.ts2 <- ts(emissions.ts2,frequency=1)
emissions.ts2
## Time Series:
## Start = 1 
## End = 29 
## Frequency = 1 
##  [1] 20607832.060 20799343.280  20741660.86  20874105.03  20983787.41
##  [6]  21582843.03  22039447.41  22394054.68  22509779.84  22615862.92
## [11]  23311805.66  23707262.47   24054448.1   25143943.9   26319528.2
## [16]   27325257.2   28223539.1   29312213.5   29574920.8   29253757.5
## [21]   31031799.0   32004027.9   32444191.6   33053846.5   33085192.0
## [26]   32943427.8   32940650.1   33351608.1   34041046.0
emissions.ts3 <- ts(emissions.ts3,frequency=1)
emissions.ts3
## Time Series:
## Start = 1 
## End = 29 
## Frequency = 1 
##  [1]  6189.36  6835.52  7191.40  7872.00  8150.80  7742.44  8823.20  9623.52
##  [9]  9831.80  9856.40 10392.68 11830.96 12298.36 14177.80 15622.64 17154.40
## [17] 18335.20 19689.84 23985.00 25877.56 28234.24 30210.44 31778.28 33349.40
## [25] 34400.64 36995.12 39651.92 41291.92 43466.56
#Plotting the time series for emissions:

x = ts(rnorm(29), frequency = 1, start = c(1990, 1))
y = ts(rnorm(29), frequency = 1, start = c(1990, 1))
ts.plot(emissions.ts2, emissions.ts3, gpars = list(col = c("black", "red")))

#Ab lines for simple ts:

plot(emissions.ts2)
abline(reg1<-lm(emissions.ts2~time(emissions.ts2)), col="red")

#abline(reg=lm(emissions.ts2~time(emissions.ts2)), col="red")
coef(reg1)
##         (Intercept) time(emissions.ts2) 
##          18330552.3            562494.6
#Slope:
coef(reg1)[2]
## time(emissions.ts2) 
##            562494.6
model1 <- lm(emissions.ts2~log(x))
## Warning in log(x): NaNs produced
summary(model1)
## 
## Call:
## lm(formula = emissions.ts2 ~ log(x))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -6344584 -3486693  -936222  4618708  5922434 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 27403416    2150405  12.743 1.35e-06 ***
## log(x)        899477    1575730   0.571    0.584    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4913000 on 8 degrees of freedom
##   (19 observations deleted due to missingness)
## Multiple R-squared:  0.03914,    Adjusted R-squared:  -0.08097 
## F-statistic: 0.3258 on 1 and 8 DF,  p-value: 0.5838
plot(model1)

plot(emissions.ts3)
abline(reg2<-lm(emissions.ts3~time(emissions.ts3)), col="red")

#abline(reg=lm(emissions.ts3~time(emissions.ts3)), col="red")
coef(reg2)
##         (Intercept) time(emissions.ts3) 
##           -886.0403           1371.3898
#Slope:
#coef(reg)[2]

#Using forecasting for passenger emissions (non-COVID years):


holt_model <- holt(dat_ts)
summary(holt_model)
## 
## Forecast method: Holt's method
## 
## Model Information:
## Holt's method 
## 
## Call:
##  holt(y = dat_ts) 
## 
##   Smoothing parameters:
##     alpha = 0.8905 
##     beta  = 1e-04 
## 
##   Initial states:
##     l = 8118.7961 
##     b = 67.628 
## 
##   sigma:  2026.368
## 
##      AIC     AICc      BIC 
## 544.9594 547.5681 551.7959 
## 
## Error measures:
##                     ME     RMSE      MAE       MPE     MAPE     MASE
## Training set -22.53991 1881.436 743.3498 -2.057968 8.325619 0.949751
##                     ACF1
## Training set 0.003471886
## 
## Forecasts:
##      Point Forecast    Lo 80    Hi 80      Lo 95    Hi 95
## 2019       9573.011 6976.115 12169.91  5601.4018 13544.62
## 2020       9640.573 6163.021 13118.13  4322.1155 14959.03
## 2021       9708.136 5531.525 13884.75  3320.5606 16095.71
## 2022       9775.698 5001.184 14550.21  2473.7087 17077.69
## 2023       9843.261 4537.689 15148.83  1729.0882 17957.43
## 2024       9910.824 4122.611 15699.04  1058.5156 18763.13
## 2025       9978.386 3744.694 16212.08   444.7759 19512.00
## 2026      10045.949 3396.465 16695.43  -123.5598 20215.46
## 2027      10113.512 3072.662 17154.36  -654.5385 20881.56
## 2028      10181.074 2769.416 17592.73 -1154.0794 21516.23
plot(holt_model)

#Forecasting for this particular dataset is not particularly insightful.
#Exporting csv for dashboarding:

#write.csv(passco,"E:/Cruise/WorldPassEmissions.csv", row.names = FALSE)

#possible exploration for:

#Liquid fuel consumption emissions
#Emissions per person
#Emissions compared to LNG