knitr::opts_chunk$set(warning = FALSE)
Crear un nuevo dataframe que sea un subconjunto del dataframe original de dfFires. El subconjunto debe contener todos los incendios del Estado de Idaho y las columnas deben ser limitadas para que sólo estén presentes las columnas YEAR_, CAUSE y TOTALACRES. Cambie el nombre de las columnas. Agrupe los datos por CAUSE y YEAR_ y luego resuma por el total de acres quemados. Trazar los resultados.
library(readr)
dfFires <- read_csv("C:/BK/Julian Acevedo/WFM_2021-11-08/WFM nov.2021/Analitica/U.NORTE/Vizualizacion datos R y Python/RDataSets/StudyArea.csv", col_types = list(UNIT = col_character()), col_names = TRUE)
knitr::kable(head(dfFires, 5))
| FID | ORGANIZATI | UNIT | SUBUNIT | SUBUNIT2 | FIRENAME | CAUSE | YEAR_ | STARTDATED | CONTRDATED | OUTDATED | STATE | STATE_FIPS | TOTALACRES |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | FWS | 81682 | USCADBR | San Diego Bay National Wildlife Refuge | PUMP HOUSE | Human | 2001 | 1/1/01 0:00 | 1/1/01 0:00 | NA | California | 6 | 0.1 |
| 1 | FWS | 81682 | USCADBR | San Diego Bay National Wildlife Refuge | I5 | Human | 2002 | 5/3/02 0:00 | 5/3/02 0:00 | NA | California | 6 | 3.0 |
| 2 | FWS | 81682 | USCADBR | San Diego Bay National Wildlife Refuge | SOUTHBAY | Human | 2002 | 6/1/02 0:00 | 6/1/02 0:00 | NA | California | 6 | 0.5 |
| 3 | FWS | 81682 | USCADBR | San Diego Bay National Wildlife Refuge | MARINA | Human | 2001 | 7/12/01 0:00 | 7/12/01 0:00 | NA | California | 6 | 0.1 |
| 4 | FWS | 81682 | USCADBR | San Diego Bay National Wildlife Refuge | HILL | Human | 1994 | 9/13/94 0:00 | 9/13/94 0:00 | NA | California | 6 | 1.0 |
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
dfFires_subset<-dfFires %>%
filter(STATE == 'Idaho') %>%
select("YR" = "YEAR_", "CS" = "CAUSE","ACRES" = "TOTALACRES")
knitr::kable(head(dfFires_subset, 5))
| YR | CS | ACRES |
|---|---|---|
| 1987 | Human | 5 |
| 1991 | Natural | 150 |
| 1991 | Human | 800 |
| 1990 | Natural | 2 |
| 1985 | Human | 38 |
dfFires_summary<-dfFires_subset %>%
group_by(CS,YR) %>%
summarise(Total_Acres_Quemados=sum(ACRES))
## `summarise()` has grouped output by 'CS'. You can override using the `.groups`
## argument.
knitr::kable(head(dfFires_summary, 5))
| CS | YR | Total_Acres_Quemados |
|---|---|---|
| Human | 1980 | 71974.7 |
| Human | 1981 | 219362.4 |
| Human | 1982 | 34016.2 |
| Human | 1983 | 48242.5 |
| Human | 1984 | 36837.8 |
# Plot the results
library(ggplot2)
ggplot(dfFires_summary, aes(x = YR, y = Total_Acres_Quemados, color = CS)) +
geom_line() +
theme_minimal() +
labs(title = 'Total acres quemados por causa y aƱo en Idaho',
x = 'AƱo',
y = 'Total Acres Quemados') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Resuelva la Tarea 1.1 de la Sección 1 de Python utilizando R.
Ejercicio para entregar Trabajaremos con el conjunto de datos de 120 aƱos de historia olĆmpica adquirido por Randi Griffin en Randi-Griffin y puesto a disposición en athlete_events. - Su tarea consiste en identificar los cinco deportes mĆ”s importantes segĆŗn el mayor nĆŗmero de medallas otorgadas en el aƱo 2016, y luego realizar el siguiente anĆ”lisis:
athlete_events_url<-'https://raw.githubusercontent.com/lihkirun/AppliedStatisticMS/main/DataVisualizationRPython/Lectures/Python/PythonDataSets/athlete_events.csv'
library(readr)
df_athlete_events <- read_csv(athlete_events_url, col_types = list(UNIT = col_character()), col_names = TRUE)
knitr::kable(head(df_athlete_events, 5))
| ID | Name | Sex | Age | Height | Weight | Team | NOC | Games | Year | Season | City | Sport | Event | Medal |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | A Dijiang | M | 24 | 180 | 80 | China | CHN | 1992 Summer | 1992 | Summer | Barcelona | Basketball | Basketball Menās Basketball | NA |
| 2 | A Lamusi | M | 23 | 170 | 60 | China | CHN | 2012 Summer | 2012 | Summer | London | Judo | Judo Menās Extra-Lightweight | NA |
| 3 | Gunnar Nielsen Aaby | M | 24 | NA | NA | Denmark | DEN | 1920 Summer | 1920 | Summer | Antwerpen | Football | Football Menās Football | NA |
| 4 | Edgar Lindenau Aabye | M | 34 | NA | NA | Denmark/Sweden | DEN | 1900 Summer | 1900 | Summer | Paris | Tug-Of-War | Tug-Of-War Menās Tug-Of-War | Gold |
| 5 | Christine Jacoba Aaftink | F | 21 | 185 | 82 | Netherlands | NED | 1988 Winter | 1988 | Winter | Calgary | Speed Skating | Speed Skating Womenās 500 metres | NA |
Genere un grÔfico que indique el número de medallas concedidas en cada uno de los cinco principales deportes en 2016.
medallas_2016 <- df_athlete_events %>%
filter(Year == 2016 & !is.na(Medal)) %>%
group_by(Sport) %>%
summarise(Medals = n()) %>%
arrange(desc(Medals)) %>%
head(5)
ggplot(medallas_2016, aes(x = reorder(Sport, Medals), y = Medals, fill = Sport)) +
geom_bar(stat = "identity") +
labs(title = "NĆŗmero de medallas concedidas en cada uno de los cinco principales deportes en 2016",
x = "Deporte",
y = "Numero de Medallas") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Trace un grÔfico que represente la distribución de la edad de los ganadores de medallas en los cinco principales deportes en 2016.
medallas_2016 <- df_athlete_events %>%
filter(Year == 2016, !is.na(Medal)) %>%
group_by(Sport) %>%
summarise(TotalMedals = n()) %>%
top_n(5, TotalMedals) %>%
pull(Sport)
datos_top_sports <- df_athlete_events %>%
filter(Year == 2016, Sport %in% medallas_2016, !is.na(Medal), !is.na(Age))
ggplot(datos_top_sports, aes(x = Age, fill = Sport)) +
geom_histogram(binwidth = 1, position = "dodge") +
facet_wrap(~Sport) +
labs(title = "Distribución de la edad de los ganadores de medallas en los cinco principales deportes en 2016",
x = "Edad",
y = "Conteo ganadores") +
theme_minimal()
Descubre qué equipos nacionales ganaron el mayor número de medallas en los cinco principales deportes en 2016.
medallas_2016 <- df_athlete_events %>%
filter(Year == 2016, !is.na(Medal))
top_sports <- medallas_2016 %>%
count(Sport) %>%
top_n(5, n) %>%
pull(Sport)
top_sports_medals <- medallas_2016 %>%
filter(Sport %in% top_sports)
top_teams_medals <- top_sports_medals %>%
group_by(NOC, Sport) %>%
summarise(Medals = n(), .groups = 'drop') %>%
arrange(desc(Medals))
top_teams_medals
## # A tibble: 94 Ć 3
## NOC Sport Medals
## <chr> <chr> <int>
## 1 USA Swimming 71
## 2 USA Athletics 46
## 3 GER Football 35
## 4 GER Hockey 33
## 5 AUS Swimming 32
## 6 JAM Athletics 30
## 7 GBR Rowing 26
## 8 ARG Hockey 18
## 9 CAN Football 18
## 10 NGR Football 18
## # ā¹ 84 more rows
Observe la tendencia del peso medio de los atletas masculinos y femeninos ganadores en los cinco principales deportes en 2016
medallas_2016 <- df_athlete_events %>%
filter(Year == 2016, !is.na(Medal))
top_sports <- medallas_2016 %>%
count(Sport) %>%
top_n(5, n) %>%
pull(Sport)
datos_top_sports <- medallas_2016 %>%
filter(Sport %in% top_sports)
weight_trends <- datos_top_sports %>%
group_by(Sport, Sex) %>%
summarise(MeanWeight = mean(Weight, na.rm = TRUE), .groups = 'drop')
ggplot(weight_trends, aes(x = Sport, y = MeanWeight, fill = Sex)) +
geom_bar(stat = "identity", position = position_dodge()) +
labs(title = "peso medio de los atletas masculinos y femeninos ganadores en los cinco principales deportes en 2016",
x = "Deporte",
y = "Peso Medio(kg)") +
scale_fill_manual(values = c("#CDC8B1", "#8B8878")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Considere el conjunto de datos us_state_population.tsv utilizado en la sección de Python, para la creación del mapa coroplĆ©tico de Estados Unidos. Repita el procedimiento planteado en cada Ćtem de esta sección para obtener el nuevo dataframe con las nuevas columnas Year y Population. Realice unión y separación utilizando las columnas State y Code.
us_state_population_url='https://raw.githubusercontent.com/lihkir/Uninorte/main/AppliedStatisticMS/DataVisualizationRPython/Lectures/Python/PythonDataSets/us_state_population.tsv'
df_us_state_population<-read_tsv(us_state_population_url )
## Rows: 51 Columns: 11
## āā Column specification āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
## Delimiter: "\t"
## chr (2): State, Code
## dbl (9): 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018
##
## ā¹ Use `spec()` to retrieve the full column specification for this data.
## ā¹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
knitr::kable(head(df_us_state_population, 5))
| State | Code | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 |
|---|---|---|---|---|---|---|---|---|---|---|
| Alabama | AL | 4785448 | 4798834 | 4815564 | 4830460 | 4842481 | 4853160 | 4864745 | 4875120 | 4887871 |
| Alaska | AK | 713906 | 722038 | 730399 | 737045 | 736307 | 737547 | 741504 | 739786 | 737438 |
| Arizona | AZ | 6407774 | 6473497 | 6556629 | 6634999 | 6733840 | 6833596 | 6945452 | 7048876 | 7171646 |
| Arkansas | AR | 2921978 | 2940407 | 2952109 | 2959549 | 2967726 | 2978407 | 2990410 | 3002997 | 3013825 |
| California | CA | 37320903 | 37641823 | 37960782 | 38280824 | 38625139 | 38953142 | 39209127 | 39399349 | 39557045 |
library(readr)
library(tidyr)
df_us_state_population_2 = gather(df_us_state_population,
"2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018",
key = "YEAR",
value = "POPULATION")
knitr::kable(head(df_us_state_population_2, 20))
| State | Code | YEAR | POPULATION |
|---|---|---|---|
| Alabama | AL | 2010 | 4785448 |
| Alaska | AK | 2010 | 713906 |
| Arizona | AZ | 2010 | 6407774 |
| Arkansas | AR | 2010 | 2921978 |
| California | CA | 2010 | 37320903 |
| Colorado | CO | 2010 | 5048281 |
| Connecticut | CT | 2010 | 3579125 |
| Delaware | DE | 2010 | 899595 |
| District of Columbia | DC | 2010 | 605085 |
| Florida | FL | 2010 | 18845785 |
| Georgia | GA | 2010 | 9711810 |
| Hawaii | HI | 2010 | 1363963 |
| Idaho | ID | 2010 | 1570773 |
| Illinois | IL | 2010 | 12840762 |
| Indiana | IN | 2010 | 6490436 |
| Iowa | IA | 2010 | 3050767 |
| Kansas | KS | 2010 | 2858213 |
| Kentucky | KY | 2010 | 4348200 |
| Louisiana | LA | 2010 | 4544532 |
| Maine | ME | 2010 | 1327632 |
df_us_state_population_2b = spread(df_us_state_population_2, key = YEAR, value = POPULATION)
knitr::kable(head(df_us_state_population_2b))
| State | Code | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 |
|---|---|---|---|---|---|---|---|---|---|---|
| Alabama | AL | 4785448 | 4798834 | 4815564 | 4830460 | 4842481 | 4853160 | 4864745 | 4875120 | 4887871 |
| Alaska | AK | 713906 | 722038 | 730399 | 737045 | 736307 | 737547 | 741504 | 739786 | 737438 |
| Arizona | AZ | 6407774 | 6473497 | 6556629 | 6634999 | 6733840 | 6833596 | 6945452 | 7048876 | 7171646 |
| Arkansas | AR | 2921978 | 2940407 | 2952109 | 2959549 | 2967726 | 2978407 | 2990410 | 3002997 | 3013825 |
| California | CA | 37320903 | 37641823 | 37960782 | 38280824 | 38625139 | 38953142 | 39209127 | 39399349 | 39557045 |
| Colorado | CO | 5048281 | 5121771 | 5193721 | 5270482 | 5351218 | 5452107 | 5540921 | 5615902 | 5695564 |
df_us_state_population_3 = unite(df_us_state_population, State_Code, State, Code)
knitr::kable(head(df_us_state_population_3, 10))
| State_Code | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 |
|---|---|---|---|---|---|---|---|---|---|
| Alabama_AL | 4785448 | 4798834 | 4815564 | 4830460 | 4842481 | 4853160 | 4864745 | 4875120 | 4887871 |
| Alaska_AK | 713906 | 722038 | 730399 | 737045 | 736307 | 737547 | 741504 | 739786 | 737438 |
| Arizona_AZ | 6407774 | 6473497 | 6556629 | 6634999 | 6733840 | 6833596 | 6945452 | 7048876 | 7171646 |
| Arkansas_AR | 2921978 | 2940407 | 2952109 | 2959549 | 2967726 | 2978407 | 2990410 | 3002997 | 3013825 |
| California_CA | 37320903 | 37641823 | 37960782 | 38280824 | 38625139 | 38953142 | 39209127 | 39399349 | 39557045 |
| Colorado_CO | 5048281 | 5121771 | 5193721 | 5270482 | 5351218 | 5452107 | 5540921 | 5615902 | 5695564 |
| Connecticut_CT | 3579125 | 3588023 | 3594395 | 3594915 | 3594783 | 3587509 | 3578674 | 3573880 | 3572665 |
| Delaware_DE | 899595 | 907316 | 915188 | 923638 | 932596 | 941413 | 949216 | 957078 | 967171 |
| District of Columbia_DC | 605085 | 619602 | 634725 | 650431 | 662513 | 675254 | 686575 | 695691 | 702455 |
| Florida_FL | 18845785 | 19093352 | 19326230 | 19563166 | 19860330 | 20224249 | 20629982 | 20976812 | 21299325 |
df_us_state_population_3b = separate(df_us_state_population_3, "State_Code",into = c("State", "Code"))
knitr::kable(head(df_us_state_population_3b, 10))
| State | Code | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 |
|---|---|---|---|---|---|---|---|---|---|---|
| Alabama | AL | 4785448 | 4798834 | 4815564 | 4830460 | 4842481 | 4853160 | 4864745 | 4875120 | 4887871 |
| Alaska | AK | 713906 | 722038 | 730399 | 737045 | 736307 | 737547 | 741504 | 739786 | 737438 |
| Arizona | AZ | 6407774 | 6473497 | 6556629 | 6634999 | 6733840 | 6833596 | 6945452 | 7048876 | 7171646 |
| Arkansas | AR | 2921978 | 2940407 | 2952109 | 2959549 | 2967726 | 2978407 | 2990410 | 3002997 | 3013825 |
| California | CA | 37320903 | 37641823 | 37960782 | 38280824 | 38625139 | 38953142 | 39209127 | 39399349 | 39557045 |
| Colorado | CO | 5048281 | 5121771 | 5193721 | 5270482 | 5351218 | 5452107 | 5540921 | 5615902 | 5695564 |
| Connecticut | CT | 3579125 | 3588023 | 3594395 | 3594915 | 3594783 | 3587509 | 3578674 | 3573880 | 3572665 |
| Delaware | DE | 899595 | 907316 | 915188 | 923638 | 932596 | 941413 | 949216 | 957078 | 967171 |
| District | of | 605085 | 619602 | 634725 | 650431 | 662513 | 675254 | 686575 | 695691 | 702455 |
| Florida | FL | 18845785 | 19093352 | 19326230 | 19563166 | 19860330 | 20224249 | 20629982 | 20976812 | 21299325 |
Utilice lo aprendido en esta sección para realizar una predicción a 7 dĆas, del valor de las acciones de Tecnoglass. Esto es. Debe realizar:
url<-'https://raw.githubusercontent.com/lihkir/Data/main/TGLS.csv'
df_TGLS<-read.csv(url)
df_TGLS$Date <- as.Date(df_TGLS$Date, format="%Y-%m-%d")
#df_TGLS$Date<-as.Date(df_TGLS$Dat,format="%Y-%m-%d")
#df_TGLS$Date <- as.POSIXct(df_TGLS$Date, format="%Y-%m-%d")
#df_TGLS<-df_TGLS[order(df_TGLS$Date),]
knitr::kable(head(df_TGLS, 5))
| Date | Open | High | Low | Close | Adj.Close | Volume |
|---|---|---|---|---|---|---|
| 2012-05-10 | 9.97 | 10.00 | 9.50 | 9.80 | 7.945550 | 6900 |
| 2012-05-11 | 9.70 | 9.70 | 9.70 | 9.70 | 7.864471 | 300 |
| 2012-05-14 | 9.80 | 9.80 | 9.80 | 9.80 | 7.945550 | 100 |
| 2012-05-15 | 9.75 | 9.75 | 9.75 | 9.75 | 7.905012 | 300 |
| 2012-05-16 | 9.75 | 9.75 | 9.75 | 9.75 | 7.905012 | 0 |
str(df_TGLS)
## 'data.frame': 2200 obs. of 7 variables:
## $ Date : Date, format: "2012-05-10" "2012-05-11" ...
## $ Open : num 9.97 9.7 9.8 9.75 9.75 9.6 9.6 9.6 9.69 9.77 ...
## $ High : num 10 9.7 9.8 9.75 9.75 9.6 9.6 9.6 9.69 9.77 ...
## $ Low : num 9.5 9.7 9.8 9.75 9.75 9.6 9.6 9.6 9.69 9.6 ...
## $ Close : num 9.8 9.7 9.8 9.75 9.75 9.6 9.6 9.6 9.69 9.6 ...
## $ Adj.Close: num 7.95 7.86 7.95 7.91 7.91 ...
## $ Volume : int 6900 300 100 300 0 800 0 0 1500 153600 ...
AnĆ”lisis Exploratorio de Datos: ACF, PAC, Descomposición, Dicky Fuller, Boxplots, Medidas de tendencia Central y Variavilidad, etc,ā¦
library(TSstudio)
library(xts)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
#ts_TGLS <- ts(df_TGLS$Close, frequency=1, start=c(2012, 5))
ts_TGLS <- ts(df_TGLS$Close, frequency=1)
ts_plot(ts_TGLS,
title='Acciones de Tecnoglass',
Ytitle ='Acciones',
Xtitle='AƱo')
Como se logra identificar en el grƔfico, las acciones de Tecnoglas tienen una tendencia aparentemente negativa, sin embargo los ultimos registros tienen una tendencia positiva. Es evidente que el comportamiento no es estacinario.
summary(ts_TGLS)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.290 7.960 9.840 9.477 10.910 15.060
Con este resumen podemos identificar que la distribucion tiene un ligero sesgo hacia la izquierda.
hist(ts_TGLS,main = "Histograma Acciones Tecnoglass", xlab = "Acciones", breaks = "Sturges", probability = TRUE)
lines(density(ts_TGLS))
## Verificar datos faltantes
sum(is.na(ts_TGLS))
## [1] 0
boxplot(ts_TGLS~cycle(ts_TGLS),xlab="Date", ylab = "y" ,main ="Boxplot")
# Descomposición
#ts_TGLS_2 <- ts(df_TGLS$Close, frequency=365, start=c(2012, 5))
ts_TGLS_2 <- ts(df_TGLS$Close,frequency=365)
library(TSstudio)
ts_decompose(ts_TGLS_2)
acf(ts_TGLS)
El grÔfico ACF nos muestra que la correlación de la serie con sus
rezagos decae en el tiempo de forma lineal, por lo tanto la eliminación
de la tendencia de la serie y la correlación entre la serie y sus
rezagos puede hacerse mediante diferenciación.
ts_TGLS_d1<-diff(ts_TGLS)
par(mfrow=c(1,2))
acf(ts_TGLS_d1)
pacf(ts_TGLS_d1)
asado en los graficos de ACF y PACF es apropiado aplicar la primera diferenciación AR(1)
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
adf.test(ts_TGLS)
##
## Augmented Dickey-Fuller Test
##
## data: ts_TGLS
## Dickey-Fuller = -2.6642, Lag order = 13, p-value = 0.2971
## alternative hypothesis: stationary
El resultado de la prueba Dicky-FUller nos indica que las acciones de Tecnoglass no son estacionarias, el \(p-value=0.2971\) es superior al valor de significancia del \(0.05\), por lo tanto no se rechaza la \(H_{0}\) que sugiere estacionalidad.
ts_plot(diff(ts_TGLS, lag=1),
title='Acciones de Tecnoglass - 1° diferenciación',
Ytitle ='Acciones',
Xtitle='AƱo')
Aplicando la primera diferenciación se logra identificar como se eliminna la tendencia de la serie y la media en general es constante.
Dibujar predicción dinÔmicas, realizar verificación de supuestos y cÔlcular métricas de precisión usando auto.arima y ademÔs, seleccionar el mejor modelo ARIMA de forma manual usando loop for.
ts_TGLS_split <- ts_split(ts_TGLS, sample.out = 7)
train<-ts_TGLS_split$train
test<-ts_TGLS_split$test
library(forecast)
TGLS_auto_md<-auto.arima(train)
TGLS_auto_md
## Series: train
## ARIMA(0,1,1)
##
## Coefficients:
## ma1
## -0.0659
## s.e. 0.0209
##
## sigma^2 = 0.04214: log likelihood = 360.94
## AIC=-717.87 AICc=-717.87 BIC=-706.49
El primer moelo auto.arima ejecutado seleccion el modelo (0,1,1) con un AIC = 648.56. No obstante se ajustara algunos argumentos con el objetivo de establecer una busqueda mas robusta y exhaustiva del modelo.
TGLS_auto_md2<-auto.arima(train,
max.order = 5,
D = 1,
d = 1,
stepwise = FALSE,
approximation = FALSE)
TGLS_auto_md2
## Series: train
## ARIMA(1,1,0)
##
## Coefficients:
## ar1
## -0.0689
## s.e. 0.0213
##
## sigma^2 = 0.04213: log likelihood = 361.17
## AIC=-718.34 AICc=-718.33 BIC=-706.95
Aplicando valores maximos en 5 para p+q+P+Q, buscando las conbinaciones posibles (stepwise = FALSE) y obtener cÔlculos mas precisos en los criterios de información (approximation = FALSE); encontramos que el mejor modelo seria el (1,1,0) con un AIC = 718.34.
TGLS_test_fc <- forecast(TGLS_auto_md2, h = 7)
accuracy(TGLS_test_fc, test)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.001345070 0.2051663 0.1210993 -0.05572084 1.437541 0.9969568
## Test set -0.005341618 0.1133806 0.0845230 -0.10243307 1.211630 0.6958402
## ACF1 Theil's U
## Training set 0.001493314 NA
## Test set 0.369024896 0.9383515
test_forecast(ts_TGLS_2,
forecast.obj = TGLS_test_fc,
test = test)
USgas_split <- ts_split(USgas, sample.out = 12)
train <- USgas_split$train
test <- USgas_split$test
library(forecast)
USgas_auto_md <- auto.arima(train)
USgas_auto_md
## Series: train
## ARIMA(2,1,1)(2,1,1)[12]
##
## Coefficients:
## ar1 ar2 ma1 sar1 sar2 sma1
## 0.4301 -0.0372 -0.9098 0.0117 -0.2673 -0.7431
## s.e. 0.0794 0.0741 0.0452 0.0887 0.0830 0.0751
##
## sigma^2 = 10446: log likelihood = -1292.83
## AIC=2599.67 AICc=2600.22 BIC=2623.2
USgas_auto_md2 <- auto.arima(train,
max.order = 5,
D = 1,
d = 1,
stepwise = FALSE,
approximation = FALSE)
USgas_auto_md2
## Series: train
## ARIMA(1,1,1)(2,1,1)[12]
##
## Coefficients:
## ar1 ma1 sar1 sar2 sma1
## 0.4247 -0.9180 0.0132 -0.2639 -0.7449
## s.e. 0.0770 0.0376 0.0894 0.0834 0.0753
##
## sigma^2 = 10405: log likelihood = -1292.96
## AIC=2597.91 AICc=2598.32 BIC=2618.08
USgas_test_fc <- forecast(USgas_auto_md2, h = 12)
accuracy(USgas_test_fc, test)
## ME RMSE MAE MPE MAPE MASE
## Training set 6.081099 97.85701 73.36854 0.1298714 3.517097 0.6371821
## Test set 42.211253 104.79281 83.09943 1.4913412 3.314280 0.7216918
## ACF1 Theil's U
## Training set 0.004565602 NA
## Test set -0.049999868 0.3469228
test_forecast(USgas,
forecast.obj = USgas_test_fc,
test = test)