Librerias Utilizadas
library(data.table)
library(dplyr)
library(plyr)
library(ggplot2)
library(naniar)
library(Hmisc)
library(psych)
library(tidyverse)
library(janitor)
library(knitr)
library(pollster)
library(epiDisplay)
library(descr)
library(tidyr)
library(textclean)
library(lubridate)
library(crosstable)
library(corrplot) # correlation plots
library(jtools) # presentation of regression analysis
library(lmtest) # diagnostic checks - linear regression analysis
library(car) # diagnostic checks - linear regression analysis
library(olsrr)
library(tseries) # time series analysis and computational finance
library(forecast) # provides methods and tools for displaying and analyzing univariate time series forecast
library(astsa)
Primero importamos la base de datos y realizamos una ligera limpieza para eliminar NA’s y ajustar formato de cada variable.
us_sales <- read.csv("C:\\Users\\javaw\\OneDrive - Instituto Tecnologico y de Estudios Superiores de Monterrey\\7mo Semestre\\Reto\\US vehicle sales.csv")
colnames(us_sales) <- us_sales[1, ]
us_sales <- us_sales[- 1, ]
us_sales$Year <- as.numeric(us_sales$Year)
us_sales$`Production (1000)` <- as.numeric(us_sales$`Production (1000)`)
us_sales$`Vehicle Sales (1000)` <- as.numeric(us_sales$`Vehicle Sales (1000)`)
us_sales$`Unemployment rate` <- as.numeric(us_sales$`Unemployment rate`)
us_sales$`Family Size` <- as.numeric(us_sales$`Family Size`)
us_sales$`Saving Rate` <- as.numeric(us_sales$`Saving Rate`)
## Warning: NAs introducidos por coerción
us_sales$`Gasoline price (USD per gallon)` <- as.numeric(us_sales$`Gasoline price (USD per gallon)`)
us_sales <- filter(us_sales, !is.na(us_sales$`Saving Rate`))
corrplot(cor(us_sales),type='upper',order='hclust',addCoef.col='black')
### Scatter Plots Saving Rate y Sales
ggplot(us_sales, aes(x=`Saving Rate`, y=`Vehicle Sales (1000)`)) +
geom_point(shape=19, size=3) + # Add points of the scatterplot
labs(title = "Relationship between US Saving Rate & Car Sales",
x="US Saving Rate", y="Car Sales (Thousands of Units)") +
theme_classic()
Unemployment Rate y Sales
ggplot(us_sales, aes(x=`Unemployment rate`, y=`Vehicle Sales (1000)`)) +
geom_point(shape=19, size=3) + # Add points of the scatterplot
labs(title = "Relationship between US Unemployment & Car Sales",
x="US Unemployment Rate", y="Car Sales (Thousands of Units)") +
theme_classic()
### Estimación de la regresión lineal múltiple
model1<-lm(`Vehicle Sales (1000)`~`Production (1000)`+`Unemployment rate`+`Family Size`+`Saving Rate`+`Gasoline price (USD per gallon)`,data=us_sales)
summary(model1)
##
## Call:
## lm(formula = `Vehicle Sales (1000)` ~ `Production (1000)` + `Unemployment rate` +
## `Family Size` + `Saving Rate` + `Gasoline price (USD per gallon)`,
## data = us_sales)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2045.2 -371.7 119.7 483.5 1129.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.213e+04 4.193e+04 1.959 0.06783 .
## `Production (1000)` 8.228e-01 2.229e-01 3.691 0.00198 **
## `Unemployment rate` -8.351e+02 1.181e+02 -7.072 2.64e-06 ***
## `Family Size` -2.073e+04 1.330e+04 -1.558 0.13871
## `Saving Rate` 2.168e+02 7.596e+01 2.854 0.01148 *
## `Gasoline price (USD per gallon)` -3.710e+02 2.919e+02 -1.271 0.22199
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 801.6 on 16 degrees of freedom
## Multiple R-squared: 0.8821, Adjusted R-squared: 0.8453
## F-statistic: 23.94 on 5 and 16 DF, p-value: 6.685e-07
Con lo siguiente obtenemos la ecuación para predecir.
library(broom)
tidy(model1, quick=TRUE)
## # A tibble: 6 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 82128. 41933. 1.96 0.0678
## 2 `Production (1000)` 0.823 0.223 3.69 0.00198
## 3 `Unemployment rate` -835. 118. -7.07 0.00000264
## 4 `Family Size` -20729. 13302. -1.56 0.139
## 5 `Saving Rate` 217. 76.0 2.85 0.0115
## 6 `Gasoline price (USD per gallon)` -371. 292. -1.27 0.222
model2<-lm(`Production (1000)`~`Vehicle Sales (1000)`+`Unemployment rate`+`Family Size`+`Saving Rate`+`Gasoline price (USD per gallon)`,data=us_sales)
summary(model2)
##
## Call:
## lm(formula = `Production (1000)` ~ `Vehicle Sales (1000)` + `Unemployment rate` +
## `Family Size` + `Saving Rate` + `Gasoline price (USD per gallon)`,
## data = us_sales)
##
## Residuals:
## Min 1Q Median 3Q Max
## -814.65 -413.22 -73.85 180.51 1268.28
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.720e+04 3.788e+04 -0.718 0.483091
## `Vehicle Sales (1000)` 5.590e-01 1.514e-01 3.691 0.001978 **
## `Unemployment rate` 4.636e+02 1.602e+02 2.894 0.010569 *
## `Family Size` 6.619e+03 1.165e+04 0.568 0.577829
## `Saving Rate` -2.227e+02 5.307e+01 -4.196 0.000684 ***
## `Gasoline price (USD per gallon)` 5.312e+01 2.521e+02 0.211 0.835786
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 660.7 on 16 degrees of freedom
## Multiple R-squared: 0.6929, Adjusted R-squared: 0.5969
## F-statistic: 7.219 on 5 and 16 DF, p-value: 0.001038
Con lo siguiente obtenemos la ecuación para predecir.
library(broom)
tidy(model2, quick=TRUE)
## # A tibble: 6 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -27197. 37878. -0.718 0.483
## 2 `Vehicle Sales (1000)` 0.559 0.151 3.69 0.00198
## 3 `Unemployment rate` 464. 160. 2.89 0.0106
## 4 `Family Size` 6619. 11650. 0.568 0.578
## 5 `Saving Rate` -223. 53.1 -4.20 0.000684
## 6 `Gasoline price (USD per gallon)` 53.1 252. 0.211 0.836
summary(autoregressive_model<-arma(us_sales$`Vehicle Sales (1000)`,order=c(1,0)))
##
## Call:
## arma(x = us_sales$`Vehicle Sales (1000)`, order = c(1, 0))
##
## Model:
## ARMA(1,0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3167.22 92.83 227.64 524.60 1227.43
##
## Coefficient(s):
## Estimate Std. Error t value Pr(>|t|)
## ar1 0.8030 0.1218 6.591 4.37e-11 ***
## intercept 2974.1621 1926.6135 1.544 0.123
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Fit:
## sigma^2 estimated as 1414639, Conditional Sum-of-Squares = 28292783, AIC = 378.01
autoregressive_model_forecast<-forecast(autoregressive_model$fitted,h=5,level=c(95))
## Warning in ets(object, lambda = lambda, biasadj = biasadj,
## allow.multiplicative.trend = allow.multiplicative.trend, : Missing values
## encountered. Using longest contiguous portion of time series
autoregressive_model_forecast
## Point Forecast Lo 95 Hi 95
## 23 14595.15 12561.27 16629.03
## 24 14595.15 11718.95 17471.34
## 25 14595.15 11072.60 18117.69
## 26 14595.15 10527.70 18662.60
## 27 14595.15 10047.62 19142.68
summary(ma_model<-arma(us_sales$`Vehicle Sales (1000)`,order=c(0,1)))
##
## Call:
## arma(x = us_sales$`Vehicle Sales (1000)`, order = c(0, 1))
##
## Model:
## ARMA(0,1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3181.4 -333.2 235.9 741.4 1149.3
##
## Coefficient(s):
## Estimate Std. Error t value Pr(>|t|)
## ma1 9.396e-01 8.420e-02 11.16 <2e-16 ***
## intercept 1.597e+04 5.422e+02 29.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Fit:
## sigma^2 estimated as 1554334, Conditional Sum-of-Squares = 31893173, AIC = 380.08
ma_model_forecast<-forecast(ma_model$fitted,h=5,level=c(95))
## Warning in ets(object, lambda = lambda, biasadj = biasadj,
## allow.multiplicative.trend = allow.multiplicative.trend, : Missing values
## encountered. Using longest contiguous portion of time series
ma_model_forecast
## Point Forecast Lo 95 Hi 95
## 23 14829.86 12830.89 16828.83
## 24 14829.86 12320.92 17338.81
## 25 14829.86 11897.29 17762.43
## 26 14829.86 11526.62 18133.10
## 27 14829.86 11192.69 18467.04