library(fpp2)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## ── Attaching packages ────────────────────────────────────────────── fpp2 2.5 ──
## ✔ ggplot2 3.4.2 ✔ fma 2.5
## ✔ forecast 8.21 ✔ expsmooth 2.3
##
library(ggplot2)
library(ggfortify)
## Registered S3 methods overwritten by 'ggfortify':
## method from
## autoplot.Arima forecast
## autoplot.acf forecast
## autoplot.ar forecast
## autoplot.bats forecast
## autoplot.decomposed.ts forecast
## autoplot.ets forecast
## autoplot.forecast forecast
## autoplot.stl forecast
## autoplot.ts forecast
## fitted.ar forecast
## fortify.ts forecast
## residuals.ar forecast
library(forecast)
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Use the help function to explore what the series gold, woolyrnq and gas represent.
help(gold)
help(woolyrnq)
help(gas)
#Use autoplot() to plot each of these in separate plots.
autoplot(gold) + labs(title = "Gold Prices", y = "USD")
autoplot(woolyrnq) + labs(title = "Quarterly production of woollen yarn in Australia (Tonnes)", y = "Production Quantity")
autoplot(gas) + labs(title = "Australian monthly gas production: 1956–1995 ", y = "Production Quantity")
# What is the frequency of each series? Hint: apply the frequency() function.
frequency(gold)
## [1] 1
frequency(woolyrnq)
## [1] 4
frequency(gas)
## [1] 12
#Use which.max() to spot the outlier in the gold series. Which observation was it?
outlier_index <- which.max(gold)        # position of the maximum
outlier_observation <- gold[outlier_index]
print(outlier_index)
## [1] 770
print(outlier_observation)
## [1] 593.7
# (a). read the data into R
retaildata <- read_excel("~/DATA 624 assignment/Predictions/retail.xlsx", skip = 1)
# (b). select one of the time series
myts <- ts(retaildata[,"A3349338X"], frequency = 12, start = c(1982, 4))
# (c). plot the retail time series
autoplot(myts) + labs(title = "Retail Time Series Data", y = "Sales")
ggseasonplot(myts)    # seasonal pattern across years
ggsubseriesplot(myts) # seasonal subseries by month
# as.numeric() strips the ts class so ggplot can pick scales without warnings
df <- data.frame(Time = as.numeric(time(myts)), Value = as.numeric(myts)) # trend
ggplot(data = df, aes(x = Time, y = Value)) +
  geom_line() +
  geom_smooth(method = "loess", se = FALSE, color = "red") +
  labs(title = "Trend of Retail Data", x = "Time", y = "Sales")
## `geom_smooth()` using formula = 'y ~ x'
# Different graphical tools reveal different features of the series: the season and subseries plots expose the seasonal pattern, while the smoothed line shows the long-run trend.
#Plot time series of sales of product A
plastics_ts <- ts(plastics, frequency = 12)
autoplot(plastics_ts) + ggtitle('Monthly Sales of Product A') + ylab('Sales (in Thousands)') + xlab('Year')
#Yes. The time plot of the sales data shows clear seasonal fluctuations within each year, along with an upward trend-cycle across the five years.
# Conduct decomposition
plastics_decomposed <- decompose(plastics_ts, type = "multiplicative")
# Print the decomposed components
print(plastics_decomposed)
## $x
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 1 742 697 776 898 1030 1107 1165 1216 1208 1131 971 783
## 2 741 700 774 932 1099 1223 1290 1349 1341 1296 1066 901
## 3 896 793 885 1055 1204 1326 1303 1436 1473 1453 1170 1023
## 4 951 861 938 1109 1274 1422 1486 1555 1604 1600 1403 1209
## 5 1030 1032 1126 1285 1468 1637 1611 1608 1528 1420 1119 1013
##
## $seasonal
## Jan Feb Mar Apr May Jun Jul
## 1 0.7670466 0.7103357 0.7765294 0.9103112 1.0447386 1.1570026 1.1636317
## 2 0.7670466 0.7103357 0.7765294 0.9103112 1.0447386 1.1570026 1.1636317
## 3 0.7670466 0.7103357 0.7765294 0.9103112 1.0447386 1.1570026 1.1636317
## 4 0.7670466 0.7103357 0.7765294 0.9103112 1.0447386 1.1570026 1.1636317
## 5 0.7670466 0.7103357 0.7765294 0.9103112 1.0447386 1.1570026 1.1636317
## Aug Sep Oct Nov Dec
## 1 1.2252952 1.2313635 1.1887444 0.9919176 0.8330834
## 2 1.2252952 1.2313635 1.1887444 0.9919176 0.8330834
## 3 1.2252952 1.2313635 1.1887444 0.9919176 0.8330834
## 4 1.2252952 1.2313635 1.1887444 0.9919176 0.8330834
## 5 1.2252952 1.2313635 1.1887444 0.9919176 0.8330834
##
## $trend
## Jan Feb Mar Apr May Jun Jul
## 1 NA NA NA NA NA NA 976.9583
## 2 1000.4583 1011.2083 1022.2917 1034.7083 1045.5417 1054.4167 1065.7917
## 3 1117.3750 1121.5417 1130.6667 1142.7083 1153.5833 1163.0000 1170.3750
## 4 1208.7083 1221.2917 1231.7083 1243.2917 1259.1250 1276.5833 1287.6250
## 5 1374.7917 1382.2083 1381.2500 1370.5833 1351.2500 1331.2500 NA
## Aug Sep Oct Nov Dec
## 1 977.0417 977.0833 978.4167 982.7083 990.4167
## 2 1076.1250 1084.6250 1094.3750 1103.8750 1112.5417
## 3 1175.5000 1180.5417 1185.0000 1190.1667 1197.0833
## 4 1298.0417 1313.0000 1328.1667 1343.5833 1360.6250
## 5 NA NA NA NA NA
##
## $random
## Jan Feb Mar Apr May Jun Jul
## 1 NA NA NA NA NA NA 1.0247887
## 2 0.9656005 0.9745267 0.9750081 0.9894824 1.0061175 1.0024895 1.0401641
## 3 1.0454117 0.9953920 1.0079773 1.0142083 0.9990100 0.9854384 0.9567618
## 4 1.0257400 0.9924762 0.9807020 0.9798704 0.9684851 0.9627557 0.9917766
## 5 0.9767392 1.0510964 1.0498039 1.0299302 1.0398787 1.0628077 NA
## Aug Sep Oct Nov Dec
## 1 1.0157335 1.0040354 0.9724119 0.9961368 0.9489762
## 2 1.0230774 1.0040674 0.9962088 0.9735577 0.9721203
## 3 0.9969907 1.0132932 1.0314752 0.9910657 1.0258002
## 4 0.9776897 0.9920952 1.0133954 1.0527311 1.0665946
## 5 NA NA NA NA NA
##
## $figure
## [1] 0.7670466 0.7103357 0.7765294 0.9103112 1.0447386 1.1570026 1.1636317
## [8] 1.2252952 1.2313635 1.1887444 0.9919176 0.8330834
##
## $type
## [1] "multiplicative"
##
## attr(,"class")
## [1] "decomposed.ts"
# Plot the decomposed time series
autoplot(plastics_decomposed)
## Warning: Removed 24 rows containing missing values (`geom_line()`).
#Yes, the results agree with the graphical interpretation from part (a): the trend component increases steadily over the five years, and the seasonal component shows a regular within-year pattern that matches the seasonal fluctuations identified earlier.
# (d) Calculate the seasonally adjusted data
plastics_adjusted <- plastics_decomposed$x / plastics_decomposed$seasonal
# Plot the seasonally adjusted data
autoplot(plastics_adjusted) +
ggtitle("Seasonally Adjusted Sales of Product A") +
xlab("Time") +
ylab("Sales (in thousands)")
# (e). Add 500 to one observation
plastics_ts_outlier <- plastics_ts
plastics_ts_outlier[50] <- plastics_ts_outlier[50] + 500
# Decompose the series with the outlier
plastics_decomposed_outlier <- decompose(plastics_ts_outlier, type = "multiplicative")
# Compute the seasonally adjusted data with the effect of outlier
plastics_adjusted_outlier <- plastics_decomposed_outlier$x / plastics_decomposed_outlier$seasonal
# Plot the seasonally adjusted data including outliers
autoplot(plastics_adjusted_outlier) +
ggtitle("Seasonally Adjusted Sales of Product A with Outlier") +
xlab("Time") +
ylab("Sales (in thousands)")
# (f). The position of the outlier matters. In the middle of the series it is smoothed into the centred moving-average trend estimate, so its effect is spread across neighbouring values; near the end it falls where the trend (and random component) is NA, so it barely affects the estimated components but still passes straight into the seasonally adjusted data as a spike.
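# A quick check (a hedged sketch): apply the same +500 shock near the end instead
plastics_ts_end <- plastics_ts
plastics_ts_end[length(plastics_ts_end) - 1] <- plastics_ts_end[length(plastics_ts_end) - 1] + 500
decomposed_end <- decompose(plastics_ts_end, type = "multiplicative")
autoplot(decomposed_end$x / decomposed_end$seasonal) +
  ggtitle("Seasonally Adjusted Sales with Outlier Near the End")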
# (a). transform the pigs data set into a time series object
pigs_ts <- ts(pigs, frequency = 12)
fit <- ses(pigs_ts, h = 4)
alpha <- fit$model$par["alpha"]  # ses() stores the fitted parameters in model$par
l0 <- fit$model$states[1]        # initial level
fc <- forecast(fit, h = 4)       # named fc so it does not shadow forecast()
print(alpha)   # the optimal smoothing parameter (about 0.30)
print(l0)
## [1] 77260.06
#Generate forecasts for the next four months
plot(fc, main = "Forecasts for Next Four Months")
# (b).Compute a 95 % prediction interval for the first forecast
# find forecast-values and residuals
forecast_values <- fc$mean
fit_residuals <- residuals(fit)  # renamed so it does not shadow stats::residuals
# calculate standard deviation of residuals
residual_sd <- sd(fit_residuals)
# Construct the prediction interval for the first forecast
first_fc <- forecast_values[1]
lower_interval <- first_fc - 1.96 * residual_sd
upper_interval <- first_fc + 1.96 * residual_sd
# extract the 95% prediction interval reported by ses() for comparison
prediction_interval <- c(fc$lower[1, "95%"], fc$upper[1, "95%"])
#Write your own function to implement simple exponential smoothing. The function should take arguments y (the time series), alpha (the smoothing parameter α) and level (the initial level ℓ0). It should return the forecast of the next observation in the series. Does it give the same forecast as ses()?
## Forecast of next observation by SES function:
## 98816.4061115907
## Forecast of next observation by ses function: 98816.4061115907
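# The SES function itself was omitted from the output above; a minimal sketch
# (an assumption: it re-uses alpha and l0 extracted from the fitted ses() model):
my_ses <- function(y, alpha, level) {
  l <- level
  for (t in seq_along(y)) {
    l <- alpha * y[t] + (1 - alpha) * l  # update the level after each observation
  }
  l  # the final level is the forecast of the next observation
}
my_ses(pigs_ts, alpha = as.numeric(alpha), level = l0)  # should match fc$mean[1]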
# Define the sum_squared_errors function
sum_squared_errors <- function(params, y) {
alpha <- params[1]
l0 <- params[2]
n <- length(y)
l <- numeric(n)
l[1] <- l0
# Compute the level estimate
for (i in 2:n) {
l[i] <- alpha * y[i-1] + (1 - alpha) * l[i-1]
}
# Compute the sum of squared errors
sum((y - l)^2)
}
pigs_ts <- ts(pigs, frequency = 12)
# Set starting values: alpha = 0.5 and the first observation as the initial level
initialvalue <- c(0.5, pigs_ts[1])
# Use optim() to find the optimal values of alpha and level;
# method = "L-BFGS-B" is required for box constraints, here keeping alpha in [0, 1]
optimum <- optim(initialvalue, sum_squared_errors, y = pigs_ts,
                 method = "L-BFGS-B", lower = c(0, -Inf), upper = c(1, Inf))$par
optimum
# The optimised values should come out close to the ses() estimates above
# (alpha of roughly 0.30 and a level near 76,000-77,000).
#(a). Explain the differences among these figures. Do they all indicate that the data are white noise?
#ACF for a white noise series of 360 numbers: with the larger sample size, the sample autocorrelations vary less and estimate the true (zero) autocorrelation more accurately; the spikes stay well inside the critical bounds, indicating no significant correlation.
#ACF for a white noise series of 1,000 numbers: with the largest sample size, the spikes are smallest and the critical bounds tightest, giving the clearest picture of randomness compared with the ACFs for 36 and 360 random numbers.
#The critical values lie at +/-1.96/sqrt(T). As the sample size decreases, the bounds move farther from zero, so a wider range of sample autocorrelations is still consistent with white noise; as the sample size increases, the estimates become more precise and the bounds shrink toward zero. All three figures are consistent with white noise and differ only in this scale.
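# A quick simulation of the three cases (a hedged sketch; the plots depend on
# the random seed and will not exactly reproduce the book's figures):
set.seed(30)
ggAcf(ts(rnorm(36)))   + ggtitle("ACF of white noise, n = 36")
ggAcf(ts(rnorm(360)))  + ggtitle("ACF of white noise, n = 360")
ggAcf(ts(rnorm(1000))) + ggtitle("ACF of white noise, n = 1000")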
ibmclose <- ts(ibmclose)
# plot the daily closing prices for IBM stock and the ACF and PACF
autoplot(ibmclose) +
  xlab("Time") +
  ggtitle("Daily Closing Prices of IBM Stock")
ggAcf(ibmclose, lag.max = 50) +
xlab("Lag") +
ggtitle("ACF of IBM Stock Closing Prices")
ggPacf(ibmclose, lag.max = 50) +
xlab("Lag") +
ggtitle("PACF of IBM Stock Closing Prices")
### HA 8.6 # Use R to simulate and plot some data from simple ARIMA models.
# (a). Use the following R code to generate data from an AR(1) model with ϕ1 = 0.6 and σ2 = 1
set.seed(1)  # fix the seed so the simulated series is reproducible
ar1 <- function(phi) {
  y <- ts(numeric(100))
  e <- rnorm(100)
  for(i in 2:100)
    y[i] <- phi * y[i-1] + e[i]
  y
}
y <- ar1(0.6)
# b. Produce a time plot for the series. How does the plot change as you change ϕ1?
autoplot(y)
#When ϕ1 is closer to 1, the series will exhibit a stronger persistence, and the values will decay more slowly over time.
#When ϕ1 is closer to 0, the series will show weaker persistence, and the values will decay more rapidly over time.
#When ϕ1 is negative, the series may exhibit oscillatory behavior depending on the magnitude of ϕ1.
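# A few variations using the ar1() helper above (a hedged illustration):
autoplot(ar1(0.1)) + ggtitle("AR(1) with ϕ1 = 0.1: close to white noise")
autoplot(ar1(0.9)) + ggtitle("AR(1) with ϕ1 = 0.9: long, persistent swings")
autoplot(ar1(-0.8)) + ggtitle("AR(1) with ϕ1 = -0.8: oscillating behaviour")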
# c.Write your own code to generate data from an MA(1) model with θ1 = 0.6 and σ2 = 1.
ma2 = function(theta,seed.my){
set.seed(seed.my)
y <- ts(numeric(100))
e <- rnorm(100)
for(i in 2:100){
y[i] <- theta*y[i-1] + e[i]
return(autoplot(y))}
}
#d. Produce a time plot for the series. How does the plot change as you change θ1?
ma2(.2,1)
ma2(1,1)
# e. Generate data from an ARMA(1,1) model with ϕ1 = 0.6, θ1 = 0.6, σ2 = 1
set.seed(1)
e <- rnorm(50)
ma.ar <- ts(numeric(50))
for(i in 2:50){
  ma.ar[i] <- 0.6 * ma.ar[i-1] + 0.6 * e[i-1] + e[i]
}
#f. Generate data from an AR(2) model with ϕ1 = -0.8, ϕ2 = 0.3, σ2 = 1 (Note that these parameters will give a non-stationary series.)
set.seed(1)
e <- rnorm(50)
ma.ar2 <- ts(numeric(50))
for(i in 3:50){
  ma.ar2[i] <- -0.8 * ma.ar2[i-1] + 0.3 * ma.ar2[i-2] + e[i]  # second term uses lag 2
}
autoplot(ma.ar) + ggtitle("ARMA(1,1) simulation")
autoplot(ma.ar2) + ggtitle("AR(2) simulation")
# The ARMA(1,1) series fluctuates around zero and looks stationary, while the
# AR(2) series oscillates with steadily growing amplitude, i.e. non-stationary,
# as the exercise's note anticipates.
### HA 8.8
###Consider austa, the total international visitors to Australia (in millions) for the period 1980-2015.
#(a). Use auto.arima() to find an appropriate ARIMA model. What model was selected? Check that the residuals look like white noise. Plot forecasts for the next 10 periods.
model <- auto.arima(austa)
model
## Series: austa
## ARIMA(0,1,1) with drift
##
## Coefficients:
## ma1 drift
## 0.3006 0.1735
## s.e. 0.1647 0.0390
##
## sigma^2 = 0.03376: log likelihood = 10.62
## AIC=-15.24 AICc=-14.46 BIC=-10.57
checkresiduals(model)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1) with drift
## Q* = 2.297, df = 6, p-value = 0.8905
##
## Model df: 1. Total lags used: 7
forecast <- forecast(model, h = 10)
autoplot(forecast, main = "Forecasts for the next 10 periods from ARIMA(0,1,1) with drift")
### HA 8.8(b) # Plot forecasts from an ARIMA(0,1,1) model with no drift and compare these to part (a). Remove the MA term and plot again.
austa011 = forecast(arima(austa, c(0,1,1)), h=10)
autoplot(austa011, main = "Forecasts for the next 10 periods from ARIMA(0,1,1)")
austa010 = forecast(arima(austa, c(0,1,0)), h=10)
autoplot(austa010, main = "Forecasts for the next 10 periods from ARIMA(0,1,0)")
### HA 8.8(c) # Plot forecasts from an ARIMA(2,1,3) model with drift. Remove the constant and see what happens.
austa213 = forecast(Arima(austa, c(2,1,3), include.drift = TRUE), h=10)
autoplot(austa213)
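# Removing the constant, the second half of part (c), can be sketched as below;
# fitting ARIMA(2,1,3) on austa without drift typically errors or warns about a
# non-stationary AR part, which is the point of the exercise:
austa213nc = tryCatch(
  forecast(Arima(austa, c(2,1,3), include.constant = FALSE), h = 10),
  error = function(e) conditionMessage(e)  # capture the failure instead of stopping
)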
### HA 8.8(d) # Plot forecasts from an ARIMA(0,0,1) model with a constant. Remove the MA term and plot again.
# include.constant is an argument of Arima(), not of forecast()
austa001 = forecast(Arima(austa, c(0,0,1), include.constant = TRUE), h = 10)
autoplot(austa001, main = "Forecasts from ARIMA(0,0,1) with constant")
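# Removing the MA term, the second half of part (d), leaves an ARIMA(0,0,0)
# model with a constant, i.e. white noise around the sample mean (a hedged sketch):
austa000 = forecast(Arima(austa, c(0,0,0), include.constant = TRUE), h = 10)
autoplot(austa000, main = "Forecasts from ARIMA(0,0,0) with constant")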
### HA 8.8(e) # Plot forecasts from an ARIMA(0,2,1) model with no constant.
# with d = 2, Arima() excludes the constant by default
austa021 = forecast(Arima(austa, c(0,2,1), include.constant = FALSE), h = 10)
autoplot(austa021, main = "Forecasts from ARIMA(0,2,1) no constant")
### KJ 3.1 # The UC Irvine Machine Learning Repository contains a data set related to glass identification. The data consist of 214 glass samples labeled as one of seven class categories. There are nine predictors, including the refractive index and percentages of eight elements: Na, Mg, Al, Si, K, Ca, Ba, and Fe.
# Load the Glass dataset from mlbench package
library(mlbench)
library(e1071)
library(moments)
##
## Attaching package: 'moments'
## The following objects are masked from 'package:e1071':
##
## kurtosis, moment, skewness
data(Glass)
# Draw histograms for predictor variables
par(mfrow = c(3, 3)) # Set the layout to 3 rows and 3 columns
hist(Glass$RI, main = "Refractive Index (RI)", xlab = "RI")
hist(Glass$Na, main = "Sodium (Na)", xlab = "Na")
hist(Glass$Mg, main = "Magnesium (Mg)", xlab = "Mg")
hist(Glass$Al, main = "Aluminum (Al)", xlab = "Al")
hist(Glass$Si, main = "Silicon (Si)", xlab = "Si")
hist(Glass$K, main = "Potassium (K)", xlab = "K")
hist(Glass$Ca, main = "Calcium (Ca)", xlab = "Ca")
hist(Glass$Ba, main = "Barium (Ba)", xlab = "Ba")
hist(Glass$Fe, main = "Iron (Fe)", xlab = "Fe")
# Explore relationships between predictors using scatter plots
pairs(Glass[, c("RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe")])
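# A compact numeric view of the same pairwise relationships (a hedged addition;
# column 10 is the class label, so it is dropped):
round(cor(Glass[, -10]), 2)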
# Create boxplots for each predictor variable
boxplot(Glass$RI, main = "Refractive Index (RI)")
boxplot(Glass$Na, main = "Sodium (Na)")
boxplot(Glass$Mg, main = "Magnesium (Mg)")
boxplot(Glass$Al, main = "Aluminum (Al)")
boxplot(Glass$Si, main = "Silicon (Si)")
boxplot(Glass$K, main = "Potassium (K)")
boxplot(Glass$Ca, main = "Calcium (Ca)")
boxplot(Glass$Ba, main = "Barium (Ba)")
boxplot(Glass$Fe, main = "Iron (Fe)")
# Calculate skewness for each predictor variable (column 10 is the class label)
skew_values <- sapply(Glass[, -10], skewness)  # named so moments::skewness is not overwritten
print(skew_values)
## RI Na Mg Al Si K Ca
## 1.6140150 0.4509917 -1.1444648 0.9009179 -0.7253173 6.5056358 2.0326774
## Ba Fe
## 3.3924309 1.7420068
# Create a boxplot showing outliers
boxplot(Glass$RI, main = "Refractive Index (RI)", outline = TRUE)
# Identify outliers
outliers <- boxplot(Glass$RI, plot = FALSE)$out
# Print the outliers
print(outliers)
## [1] 1.52667 1.52320 1.51215 1.52725 1.52410 1.52475 1.53125 1.53393 1.52664
## [10] 1.52739 1.52777 1.52614 1.52369 1.51115 1.51131 1.52315 1.52365
#Negative skewness means the distribution is left-skewed; positive skewness means it is right-skewed. A skewness near zero means the mean and median roughly coincide and the distribution is approximately symmetric (though not necessarily normal).
###KJ 3.1(C) #Are there any relevant transformations of one or more predictors that might improve the classification model?
library(caret)
## Loading required package: lattice
# Transform data
transdata <- preProcess(Glass, method = c("BoxCox", "center", "scale", "pca"))
transdata
## Created from 214 samples and 10 variables
##
## Pre-processing:
## - Box-Cox transformation (5)
## - centered (9)
## - ignored (1)
## - principal component signal extraction (9)
## - scaled (9)
##
## Lambda estimates for Box-Cox transformation:
## -2, -0.1, 0.5, 2, -1.1
## PCA needed 7 components to capture 95 percent of the variance
transformed <- predict(transdata, Glass)
#The Soybean data can also be found at the UC Irvine Machine Learning Repository. Data were collected to predict disease in 683 soybeans. The 35 predictors are mostly categorical and include information on the environmental conditions (e.g., temperature, precipitation) and plant conditions (e.g., leaf spots, mold growth). The outcome labels consist of 19 distinct classes.
###KJ 3.2(a) #(a) Investigate the frequency distributions for the categorical predictors. Are any of the distributions degenerate in the ways discussed earlier in this chapter?
data("Soybean")
# Identify the categorical predictors of interest
columns <- colnames(Soybean)
# Investigate frequency distributions and create bar plots
# (.data[[col]] is the tidy-eval replacement for the deprecated aes_string())
lapply(columns,
       function(col) {
         ggplot(Soybean, aes(x = .data[[col]])) +
           geom_bar() + coord_flip() + ggtitle(col)
       })
## (output: a list of 36 bar charts, one per column of Soybean; plots omitted)
#Roughly 18 % of the data are missing. Are there particular predictors that are more likely to be missing? Is the pattern of missing data related to the classes?
# Calculate proportion of missing values for each predictor
missing_values <- colMeans(is.na(Soybean))
missing_values
## Class date plant.stand precip temp
## 0.000000000 0.001464129 0.052708638 0.055636896 0.043923865
## hail crop.hist area.dam sever seed.tmt
## 0.177159590 0.023426061 0.001464129 0.177159590 0.177159590
## germ plant.growth leaves leaf.halo leaf.marg
## 0.163982430 0.023426061 0.000000000 0.122986823 0.122986823
## leaf.size leaf.shread leaf.malf leaf.mild stem
## 0.122986823 0.146412884 0.122986823 0.158125915 0.023426061
## lodging stem.cankers canker.lesion fruiting.bodies ext.decay
## 0.177159590 0.055636896 0.055636896 0.155197657 0.055636896
## mycelium int.discolor sclerotia fruit.pods fruit.spots
## 0.055636896 0.055636896 0.055636896 0.122986823 0.155197657
## seed mold.growth seed.discolor seed.size shriveling
## 0.134699854 0.134699854 0.155197657 0.134699854 0.155197657
## roots
## 0.045387994
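# Sorted for readability: per the output above, hail, sever, seed.tmt and
# lodging are the most affected predictors, each missing in about 17.7% of samples
sort(missing_values, decreasing = TRUE)[1:5]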
# Calculate the number and proportion of missing cells by class;
# note that sum(is.na(.)) would count NAs over the whole data frame, not per group
Soybean %>%
  group_by(Class) %>%
  summarize(Total_cells = n() * (ncol(Soybean) - 1),
            Missing = sum(is.na(across(everything()))),
            Proportion = Missing / Total_cells)
#Answer: Yes, the missingness is class-related: it is concentrated in a handful of classes (phytophthora-rot being the largest). Those 18% of missing values can be filled with KNN imputation, where each missing value is imputed from the nearest neighbours, i.e. observations with similar values on the available predictors.
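# A minimal sketch of that imputation with caret (assumptions: knnImpute needs
# numeric input, so the ordinal factor codes are converted first, and the RANN
# package must be installed for the nearest-neighbour search):
soy_num <- data.frame(lapply(Soybean[, -1], function(x) as.numeric(as.character(x))))
pre_knn <- preProcess(soy_num, method = "knnImpute")
soy_imputed <- predict(pre_knn, soy_num)  # NAs replaced by averages of the k nearest rows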