.
# Coerce the loaded object `traina` to a plain data.frame and list its
# columns with their type and first values.
titraina <- data.frame(traina)
glimpse(titraina)
Rows: 950
Columns: 41
$ X1 <dbl> 1350, 1351, 1352, 1353, 1354, 1355, 1356, 1357...
$ iso_code <chr> "ARG", "ARG", "ARG", "ARG", "ARG", "ARG", "ARG...
$ continent <chr> "South America", "South America", "South Ameri...
$ location <chr> "Argentina", "Argentina", "Argentina", "Argent...
$ date <date> 2020-02-11, 2020-02-12, 2020-02-13, 2020-02-1...
$ total_cases <dbl> 0, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ new_cases <dbl> 0, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ new_cases_smoothed <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ total_deaths <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ new_deaths <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ new_deaths_smoothed <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ total_cases_per_million <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ new_cases_per_million <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ new_cases_smoothed_per_million <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ total_deaths_per_million <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ new_deaths_per_million <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ new_deaths_smoothed_per_million <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ new_tests <dbl> 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0...
$ total_tests <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 5, 5...
$ total_tests_per_thousand <dbl> 0, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, ...
$ new_tests_per_thousand <dbl> 0, NA, NA, NA, NA, NA, NA, NA, NA, 0, NA, NA, ...
$ new_tests_smoothed <dbl> NA, NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, ...
$ new_tests_smoothed_per_thousand <dbl> NA, NA, NA, NA, NA, NA, NA, 0, 0, 0, 0, 0, 0, ...
$ tests_per_case <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ positive_rate <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ tests_units <chr> "people tested", "people tested", "people test...
$ stringency_index <dbl> 11.11, 11.11, 11.11, 11.11, 11.11, 11.11, 11.1...
$ population <dbl> 45195777, 45195777, 45195777, 45195777, 451957...
$ population_density <dbl> 16177, 16177, 16177, 16177, 16177, 16177, 1617...
$ median_age <dbl> 31.9, 31.9, 31.9, 31.9, 31.9, 31.9, 31.9, 31.9...
$ aged_65_older <dbl> 11198, 11198, 11198, 11198, 11198, 11198, 1119...
$ aged_70_older <dbl> 7441, 7441, 7441, 7441, 7441, 7441, 7441, 7441...
$ gdp_per_capita <dbl> 18933907, 18933907, 18933907, 18933907, 189339...
$ extreme_poverty <dbl> 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0...
$ cardiovasc_death_rate <dbl> 191032, 191032, 191032, 191032, 191032, 191032...
$ diabetes_prevalence <dbl> 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5...
$ female_smokers <dbl> 16.2, 16.2, 16.2, 16.2, 16.2, 16.2, 16.2, 16.2...
$ male_smokers <dbl> 27.7, 27.7, 27.7, 27.7, 27.7, 27.7, 27.7, 27.7...
$ handwashing_facilities <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ hospital_beds_per_thousand <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5...
$ life_expectancy <dbl> 76.67, 76.67, 76.67, 76.67, 76.67, 76.67, 76.6...
Se carga el data set y se inspeccionan las variables.
# Count the distinct values in `x`.
#
# x: a vector (for example one column of a data frame).
# Returns a single integer: the length of unique(x). Because unique() keeps
# NA as a value, missing entries count as one distinct value.
unicos <- function(x) {
  length(unique(x))
}
# Number of distinct values per column of titraina, computed with unicos().
# across() replaces the superseded summarise_all(); the result columns keep
# the "<column>_unicos" naming. t() turns the one-row summary into a
# one-column matrix so that it prints one variable per line.
datounico <- titraina %>%
  dplyr::summarise(
    dplyr::across(dplyr::everything(), list(unicos = unicos))
  ) %>%
  t()
datounico
[,1]
X1_unicos 950
iso_code_unicos 4
continent_unicos 2
location_unicos 4
date_unicos 248
total_cases_unicos 725
new_cases_unicos 665
new_cases_smoothed_unicos 723
total_deaths_unicos 655
new_deaths_unicos 398
new_deaths_smoothed_unicos 545
total_cases_per_million_unicos 746
new_cases_per_million_unicos 702
new_cases_smoothed_per_million_unicos 734
total_deaths_per_million_unicos 685
new_deaths_per_million_unicos 484
new_deaths_smoothed_per_million_unicos 586
new_tests_unicos 733
total_tests_unicos 771
total_tests_per_thousand_unicos 685
new_tests_per_thousand_unicos 461
new_tests_smoothed_unicos 706
new_tests_smoothed_per_thousand_unicos 450
tests_per_case_unicos 673
positive_rate_unicos 311
tests_units_unicos 3
stringency_index_unicos 41
population_unicos 4
population_density_unicos 4
median_age_unicos 4
aged_65_older_unicos 4
aged_70_older_unicos 4
gdp_per_capita_unicos 4
extreme_poverty_unicos 4
cardiovasc_death_rate_unicos 4
diabetes_prevalence_unicos 4
female_smokers_unicos 4
male_smokers_unicos 4
handwashing_facilities_unicos 2
hospital_beds_per_thousand_unicos 4
life_expectancy_unicos 4
# Number of missing values in each column of titraina. vapply() pins the
# result to exactly one integer per column, so an unexpected input fails
# loudly instead of silently changing the return type as sapply() can.
vapply(titraina, function(x) sum(is.na(x)), integer(1))
X1 iso_code
0 0
continent location
0 0
date total_cases
0 29
new_cases new_cases_smoothed
29 46
total_deaths new_deaths
30 30
new_deaths_smoothed total_cases_per_million
46 33
new_cases_per_million new_cases_smoothed_per_million
33 46
total_deaths_per_million new_deaths_per_million
33 33
new_deaths_smoothed_per_million new_tests
46 160
total_tests total_tests_per_thousand
158 177
new_tests_per_thousand new_tests_smoothed
183 185
new_tests_smoothed_per_thousand tests_per_case
185 259
positive_rate tests_units
207 157
stringency_index population
35 0
population_density median_age
0 0
aged_65_older aged_70_older
0 0
gdp_per_capita extreme_poverty
0 0
cardiovasc_death_rate diabetes_prevalence
0 0
female_smokers male_smokers
0 0
handwashing_facilities hospital_beds_per_thousand
702 0
life_expectancy
0
# Missing values per column, then the grand total over the whole data frame.
# vapply() guarantees an integer vector (sapply() returns an empty list for a
# data frame with no columns, which sum() cannot handle).
cant <- vapply(titraina, function(x) sum(is.na(x)), integer(1))
sum(cant)
[1] 2842
# Inspect datofilt: column names, types and first values.
# NOTE(review): the code that builds datofilt is not shown in this section;
# it is presumably a column subset of titraina -- confirm.
glimpse(datofilt)
Rows: 950
Columns: 19
$ date <date> 2020-02-11, 2020-02-12, 2020-02-13, 2020-02-14, 2...
$ location <chr> "Argentina", "Argentina", "Argentina", "Argentina"...
$ total_cases <dbl> 0, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ new_cases <dbl> 0, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ total_deaths <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ new_deaths <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ population_density <dbl> 16177, 16177, 16177, 16177, 16177, 16177, 16177, 1...
$ new_tests <dbl> 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 5,...
$ total_tests <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, 5, 5, 10...
$ tests_per_case <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ median_age <dbl> 31.9, 31.9, 31.9, 31.9, 31.9, 31.9, 31.9, 31.9, 31...
$ aged_65_older <dbl> 11198, 11198, 11198, 11198, 11198, 11198, 11198, 1...
$ gdp_per_capita <dbl> 18933907, 18933907, 18933907, 18933907, 18933907, ...
$ extreme_poverty <dbl> 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, ...
$ cardiovasc_death_rate <dbl> 191032, 191032, 191032, 191032, 191032, 191032, 19...
$ diabetes_prevalence <dbl> 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, 5.5, ...
$ female_smokers <dbl> 16.2, 16.2, 16.2, 16.2, 16.2, 16.2, 16.2, 16.2, 16...
$ male_smokers <dbl> 27.7, 27.7, 27.7, 27.7, 27.7, 27.7, 27.7, 27.7, 27...
$ hospital_beds_per_thousand <dbl> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,...
# Correlation matrix (cor() default method) of columns 3 to 19 of datofilt,
# computed only on rows with no missing value in any of those columns and
# rounded to three decimals.
rs <- datofilt[, 3:19] %>%
  cor(use = "complete.obs") %>%
  round(3)
rs
total_cases new_cases total_deaths new_deaths
total_cases 1.000 0.889 0.954 0.486
new_cases 0.889 1.000 0.882 0.650
total_deaths 0.954 0.882 1.000 0.570
new_deaths 0.486 0.650 0.570 1.000
population_density 0.109 0.139 0.208 0.291
new_tests 0.928 0.919 0.933 0.519
total_tests 0.990 0.861 0.916 0.398
tests_per_case -0.070 -0.137 -0.079 -0.183
median_age 0.261 0.270 0.236 0.133
aged_65_older 0.286 0.303 0.243 0.130
gdp_per_capita 0.534 0.589 0.545 0.440
extreme_poverty 0.022 0.036 0.123 0.213
cardiovasc_death_rate 0.001 0.034 -0.028 0.003
diabetes_prevalence 0.251 0.288 0.365 0.416
female_smokers 0.478 0.551 0.429 0.327
male_smokers 0.192 0.250 0.161 0.158
hospital_beds_per_thousand -0.071 -0.066 -0.172 -0.214
population_density new_tests total_tests tests_per_case
total_cases 0.109 0.928 0.990 -0.070
new_cases 0.139 0.919 0.861 -0.137
total_deaths 0.208 0.933 0.916 -0.079
new_deaths 0.291 0.519 0.398 -0.183
population_density 1.000 0.065 0.050 -0.368
new_tests 0.065 1.000 0.914 -0.043
total_tests 0.050 0.914 1.000 -0.018
tests_per_case -0.368 -0.043 -0.018 1.000
median_age -0.675 0.347 0.298 0.457
aged_65_older -0.769 0.376 0.321 0.407
gdp_per_capita -0.353 0.633 0.535 0.281
extreme_poverty 0.983 -0.028 -0.027 -0.297
cardiovasc_death_rate 0.217 -0.037 -0.040 -0.482
diabetes_prevalence 0.898 0.240 0.200 -0.183
female_smokers -0.492 0.556 0.464 -0.039
male_smokers 0.107 0.185 0.146 -0.451
hospital_beds_per_thousand -0.619 -0.064 -0.057 -0.097
median_age aged_65_older gdp_per_capita extreme_poverty
total_cases 0.261 0.286 0.534 0.022
new_cases 0.270 0.303 0.589 0.036
total_deaths 0.236 0.243 0.545 0.123
new_deaths 0.133 0.130 0.440 0.213
population_density -0.675 -0.769 -0.353 0.983
new_tests 0.347 0.376 0.633 -0.028
total_tests 0.298 0.321 0.535 -0.027
tests_per_case 0.457 0.407 0.281 -0.297
median_age 1.000 0.975 0.887 -0.642
aged_65_older 0.975 1.000 0.867 -0.767
gdp_per_capita 0.887 0.867 1.000 -0.379
extreme_poverty -0.642 -0.767 -0.379 1.000
cardiovasc_death_rate -0.713 -0.542 -0.560 0.080
diabetes_prevalence -0.287 -0.435 0.038 0.910
female_smokers 0.443 0.613 0.580 -0.635
male_smokers -0.494 -0.302 -0.281 -0.059
hospital_beds_per_thousand -0.028 0.185 -0.153 -0.718
cardiovasc_death_rate diabetes_prevalence female_smokers
total_cases 0.001 0.251 0.478
new_cases 0.034 0.288 0.551
total_deaths -0.028 0.365 0.429
new_deaths 0.003 0.416 0.327
population_density 0.217 0.898 -0.492
new_tests -0.037 0.240 0.556
total_tests -0.040 0.200 0.464
tests_per_case -0.482 -0.183 -0.039
median_age -0.713 -0.287 0.443
aged_65_older -0.542 -0.435 0.613
gdp_per_capita -0.560 0.038 0.580
extreme_poverty 0.080 0.910 -0.635
cardiovasc_death_rate 1.000 -0.187 0.294
diabetes_prevalence -0.187 1.000 -0.446
female_smokers 0.294 -0.446 1.000
male_smokers 0.953 -0.218 0.558
hospital_beds_per_thousand 0.632 -0.858 0.644
male_smokers hospital_beds_per_thousand
total_cases 0.192 -0.071
new_cases 0.250 -0.066
total_deaths 0.161 -0.172
new_deaths 0.158 -0.214
population_density 0.107 -0.619
new_tests 0.185 -0.064
total_tests 0.146 -0.057
tests_per_case -0.451 -0.097
median_age -0.494 -0.028
aged_65_older -0.302 0.185
gdp_per_capita -0.281 -0.153
extreme_poverty -0.059 -0.718
cardiovasc_death_rate 0.953 0.632
diabetes_prevalence -0.218 -0.858
female_smokers 0.558 0.644
male_smokers 1.000 0.687
hospital_beds_per_thousand 0.687 1.000
# Scatter plots of new_deaths against tests_per_case, one per country subset
# (arg, mex, can, eu).
#
# The assignment below originally used a Unicode minus sign (U+2212) in place
# of "-", which R cannot parse; it is now the ASCII "<-" operator.
#
# NOTE: par(mfrow = ...) only lays out base-graphics plots. ggplot2 draws with
# grid, so each plot below still fills a whole page. The call is kept so that
# op still holds the previous base-graphics settings; arranging the four
# ggplots in a 2 x 2 grid would need a layout package such as gridExtra or
# patchwork.
op <- par(mfrow = c(2, 2))

# Columns are referenced by bare name inside aes() instead of arg$..., so the
# mapping always refers to the data frame given to ggplot(). The colour is a
# fixed geom_point() argument: color = "red" inside aes() is treated as a data
# value, which adds a legend and uses the default palette colour, not red.
ggplot(arg, aes(tests_per_case, new_deaths)) +
  geom_point(color = "red")
ggplot(mex, aes(tests_per_case, new_deaths)) +
  geom_point(color = "red")
ggplot(can, aes(tests_per_case, new_deaths)) +
  geom_point(color = "red")
ggplot(eu, aes(tests_per_case, new_deaths)) +
  geom_point(color = "red")
Se observa una fuerte correlación entre total de muertes, total de casos y total de tests. - A más mayores de 65, mayor GDP. - A mayor total de casos, más diabetes (al igual que DBT y muertes). - A mayor pobreza, mayor DBT y más fumadores. - A más camas, mayor tasa de muerte cardiovascular y menor DBT; a más mayores de 65, menos fumadores.
Correlación de new_cases con new_deaths: esta variable se seguirá investigando (al aumentar tests_per_case, disminuye levemente new_deaths).
# Quantiles (0%, 25%, 50%, 75%, 100%) of columns 4 to 19 of datofilt, one
# column of the result per variable.
sapply(datofilt[, 4:19], quantile)
new_cases total_deaths new_deaths population_density new_tests
0% 0.0 0.00 0.00 4037 0.00
25% 3.0 0.25 0.00 4037 41.25
50% 704.5 1740.50 23.50 35608 6849.00
75% 5774.5 13101.25 338.25 66444 35932.50
100% 78427.0 185744.00 4928.00 66444 1194086.00
total_tests tests_per_case median_age aged_65_older
0% 0.0 0.0000 29.3 6857
25% 1224.0 0.0000 29.3 6857
50% 264894.5 25.7075 38.3 15413
75% 1758582.8 9170.2500 41.4 16984
100% 83898416.0 148923.0000 41.4 16984
gdp_per_capita extreme_poverty cardiovasc_death_rate
0% 17336469 0.5 105599
25% 17336469 0.5 105599
50% 44017591 1.2 151089
75% 54225446 2.5 152783
100% 54225446 2.5 191032
diabetes_prevalence female_smokers male_smokers
0% 5.50 6.9 16.6
25% 7.37 6.9 16.6
50% 10.79 12.0 21.4
75% 13.06 19.1 24.6
100% 13.06 19.1 27.7
hospital_beds_per_thousand
0% 1.38
25% 1.38
50% 2.50
75% 2.77
100% 5.00
# Summary of new_deaths for each location: mean, first quartile, median,
# third quartile and maximum.
datofilt2 %>%
  group_by(location) %>%
  summarise(
    mean = mean(new_deaths),
    primerq = quantile(new_deaths, probs = 0.25),
    mediana = median(new_deaths),
    tercerq = quantile(new_deaths, probs = 0.75),
    max = max(new_deaths)
  )
`summarise()` ungrouping output (override with `.groups` argument)
# Coefficients and fit statistics of modtest. Its definition is not shown in
# this section; the printed Call reports new_deaths ~ new_tests on datofilt2.
summary(modtest)
Call:
lm(formula = new_deaths ~ new_tests, data = datofilt2)
Residuals:
Min 1Q Median 3Q Max
-1296.0 -162.2 -158.2 -16.2 4544.9
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.622e+02 1.516e+01 10.69 <2e-16 ***
new_tests 1.261e-03 6.201e-05 20.34 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 429.4 on 948 degrees of freedom
Multiple R-squared: 0.3037, Adjusted R-squared: 0.303
F-statistic: 413.6 on 1 and 948 DF, p-value: < 2.2e-16
# Coefficients and fit statistics of modpr4 (new_deaths ~ tests_per_case on
# the mex data, per the printed Call).
summary(modpr4)
Call:
lm(formula = new_deaths ~ tests_per_case, data = mex)
Residuals:
Min 1Q Median 3Q Max
-383.31 -268.25 -82.41 264.59 727.81
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 383.424306 23.657936 16.207 < 2e-16 ***
tests_per_case -0.008011 0.002514 -3.187 0.00171 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 294.1 on 171 degrees of freedom
(75 observations deleted due to missingness)
Multiple R-squared: 0.05606, Adjusted R-squared: 0.05054
F-statistic: 10.16 on 1 and 171 DF, p-value: 0.00171
# Simple linear regressions of new_deaths on one predictor at a time
# (new_cases, new_tests, tests_per_case), fitted separately on each country
# subset. The trailing comments record the fit quality noted by the author.
# NOTE(review): the modpr* values were labelled just "R-squared"; for modpr4
# the printed summary shows 0.05054 is the adjusted R-squared, so the others
# are presumably adjusted as well -- confirm.

# arg
modcase1 <- lm(new_deaths ~ new_cases, data = arg)       # Adjusted R-squared: 0.7633
modtest1 <- lm(new_deaths ~ new_tests, data = arg)       # Adjusted R-squared: 0.6969
modpr1 <- lm(new_deaths ~ tests_per_case, data = arg)    # R-squared: 0.08562

# can
modcase2 <- lm(new_deaths ~ new_cases, data = can)       # Adjusted R-squared: 0.6533
# NOTE(review): same value as modtest1 -- possibly copied; confirm.
modtest2 <- lm(new_deaths ~ new_tests, data = can)       # Adjusted R-squared: 0.6969
modpr2 <- lm(new_deaths ~ tests_per_case, data = can)    # R-squared: 0.1725

# eu
modcase3 <- lm(new_deaths ~ new_cases, data = eu)        # Adjusted R-squared: 0.2661
modtest3 <- lm(new_deaths ~ new_tests, data = eu)        # Adjusted R-squared: -0.003952
modpr3 <- lm(new_deaths ~ tests_per_case, data = eu)     # R-squared: 0.09157

# mex
modcase4 <- lm(new_deaths ~ new_cases, data = mex)       # Adjusted R-squared: 0.8574
modtest4 <- lm(new_deaths ~ new_tests, data = mex)       # Adjusted R-squared: 0.5993
modpr4 <- lm(new_deaths ~ tests_per_case, data = mex)    # Adjusted R-squared: 0.05054

summary(modpr4)
1- Regresión Lineal Múltiple a- Modelo para predecir precio con todas las covariables
Series temporales: evaluamos la autocorrelación.
# Split the base-graphics device into a 2 x 2 grid of panels. par() returns
# the settings that were in effect before the change; they are kept in op.
op <- par(mfrow = c(2, 2))
# Print the saved previous settings.
op
$mfrow
[1] 1 1
# Cross-correlation function of each x/y pair, for lags up to 50 in both
# directions; each call draws one plot.
# NOTE(review): the construction of these series is not shown here; they are
# presumably new_cases (x*) and new_deaths (y*) per country -- confirm.
ccf(x = x, y = y, lag.max = 50)
ccf(x = x1, y = y1, lag.max = 50)
ccf(x = x2, y = y2, lag.max = 50)
ccf(x = x3, y = y3, lag.max = 50)
Ahora probamos con series temporales (ST).
# Inspect shopping: column names, types and first values.
# NOTE(review): the code that builds shopping is not shown in this section.
glimpse(shopping)
Rows: 248
Columns: 2
$ date <date> 2019-12-31, 2020-01-01, 2020-01-02, 2020-01-0...
$ new_deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
# Turn the x3 and y3 series into ts objects with frequency = 1.
my3 <- ts(x3, frequency = 1)
myy3 <- ts(y3, frequency = 1)

# Regress each series on a linear time trend.
fit1 <- tslm(my3 ~ trend)
fit2 <- tslm(myy3 ~ trend)

# Two side-by-side panels for the forecast plots; op keeps the previous
# graphical parameters and is printed for inspection.
op <- par(mfrow = c(1, 2))
op
$mfrow
[1] 1 1
# Forecast 40 periods ahead from each trend model and plot the result
# (fit1 is the model for my3, fit2 the model for myy3).
plot(forecast(fit1, h=40))
plot(forecast(fit2, h=40))
# eu: linear-trend forecasts of new_cases (x2) and new_deaths (y2).
tiempo2 <- eu["date"]
y2 <- eu["new_deaths"]
y2[is.na(y2)] <- 0  # missing counts are replaced by zero
x2 <- eu["new_cases"]
x2[is.na(x2)] <- 0

# Fit the trend models on the eu series prepared above. This section used to
# re-fit x3/y3 (a copy-paste slip), which left x2/y2 unused and redrew the
# previous forecasts. New names (my2, myy2) are used so that my3 and myy3
# keep the x3/y3 series they already held.
my2 <- ts(x2, frequency = 1)
fit1 <- tslm(my2 ~ trend)
myy2 <- ts(y2, frequency = 1)
fit2 <- tslm(myy2 ~ trend)

# Two side-by-side panels; op keeps the previous graphical parameters.
op <- par(mfrow = c(1, 2))
op
plot(forecast(fit1, h = 40))
plot(forecast(fit2, h = 40))
# Restore the graphical parameters that were in effect before this section.
par(op)