#install.packages("ggplot2")
#install.packages("gridExtra")
#install.packages("cowplot")
#install.packages("patchwork")
library(psych)
## Warning: package 'psych' was built under R version 4.5.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
##
## Adjuntando el paquete: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(gridExtra)
library(cowplot)
## Warning: package 'cowplot' was built under R version 4.5.3
library(patchwork)
## Warning: package 'patchwork' was built under R version 4.5.3
##
## Adjuntando el paquete: 'patchwork'
## The following object is masked from 'package:cowplot':
##
## align_plots
Usaremos el set de datos states.x77 del paquete base “datasets” que R carga al iniciar el programa. Pueden obtener los datos usando el código: Datos = state.x77
Datos = state.x77
Datos
## Population Income Illiteracy Life Exp Murder HS Grad Frost
## Alabama 3615 3624 2.1 69.05 15.1 41.3 20
## Alaska 365 6315 1.5 69.31 11.3 66.7 152
## Arizona 2212 4530 1.8 70.55 7.8 58.1 15
## Arkansas 2110 3378 1.9 70.66 10.1 39.9 65
## California 21198 5114 1.1 71.71 10.3 62.6 20
## Colorado 2541 4884 0.7 72.06 6.8 63.9 166
## Connecticut 3100 5348 1.1 72.48 3.1 56.0 139
## Delaware 579 4809 0.9 70.06 6.2 54.6 103
## Florida 8277 4815 1.3 70.66 10.7 52.6 11
## Georgia 4931 4091 2.0 68.54 13.9 40.6 60
## Hawaii 868 4963 1.9 73.60 6.2 61.9 0
## Idaho 813 4119 0.6 71.87 5.3 59.5 126
## Illinois 11197 5107 0.9 70.14 10.3 52.6 127
## Indiana 5313 4458 0.7 70.88 7.1 52.9 122
## Iowa 2861 4628 0.5 72.56 2.3 59.0 140
## Kansas 2280 4669 0.6 72.58 4.5 59.9 114
## Kentucky 3387 3712 1.6 70.10 10.6 38.5 95
## Louisiana 3806 3545 2.8 68.76 13.2 42.2 12
## Maine 1058 3694 0.7 70.39 2.7 54.7 161
## Maryland 4122 5299 0.9 70.22 8.5 52.3 101
## Massachusetts 5814 4755 1.1 71.83 3.3 58.5 103
## Michigan 9111 4751 0.9 70.63 11.1 52.8 125
## Minnesota 3921 4675 0.6 72.96 2.3 57.6 160
## Mississippi 2341 3098 2.4 68.09 12.5 41.0 50
## Missouri 4767 4254 0.8 70.69 9.3 48.8 108
## Montana 746 4347 0.6 70.56 5.0 59.2 155
## Nebraska 1544 4508 0.6 72.60 2.9 59.3 139
## Nevada 590 5149 0.5 69.03 11.5 65.2 188
## New Hampshire 812 4281 0.7 71.23 3.3 57.6 174
## New Jersey 7333 5237 1.1 70.93 5.2 52.5 115
## New Mexico 1144 3601 2.2 70.32 9.7 55.2 120
## New York 18076 4903 1.4 70.55 10.9 52.7 82
## North Carolina 5441 3875 1.8 69.21 11.1 38.5 80
## North Dakota 637 5087 0.8 72.78 1.4 50.3 186
## Ohio 10735 4561 0.8 70.82 7.4 53.2 124
## Oklahoma 2715 3983 1.1 71.42 6.4 51.6 82
## Oregon 2284 4660 0.6 72.13 4.2 60.0 44
## Pennsylvania 11860 4449 1.0 70.43 6.1 50.2 126
## Rhode Island 931 4558 1.3 71.90 2.4 46.4 127
## South Carolina 2816 3635 2.3 67.96 11.6 37.8 65
## South Dakota 681 4167 0.5 72.08 1.7 53.3 172
## Tennessee 4173 3821 1.7 70.11 11.0 41.8 70
## Texas 12237 4188 2.2 70.90 12.2 47.4 35
## Utah 1203 4022 0.6 72.90 4.5 67.3 137
## Vermont 472 3907 0.6 71.64 5.5 57.1 168
## Virginia 4981 4701 1.4 70.08 9.5 47.8 85
## Washington 3559 4864 0.6 71.72 4.3 63.5 32
## West Virginia 1799 3617 1.4 69.48 6.7 41.6 100
## Wisconsin 4589 4468 0.7 72.48 3.0 54.5 149
## Wyoming 376 4566 0.6 70.29 6.9 62.9 173
## Area
## Alabama 50708
## Alaska 566432
## Arizona 113417
## Arkansas 51945
## California 156361
## Colorado 103766
## Connecticut 4862
## Delaware 1982
## Florida 54090
## Georgia 58073
## Hawaii 6425
## Idaho 82677
## Illinois 55748
## Indiana 36097
## Iowa 55941
## Kansas 81787
## Kentucky 39650
## Louisiana 44930
## Maine 30920
## Maryland 9891
## Massachusetts 7826
## Michigan 56817
## Minnesota 79289
## Mississippi 47296
## Missouri 68995
## Montana 145587
## Nebraska 76483
## Nevada 109889
## New Hampshire 9027
## New Jersey 7521
## New Mexico 121412
## New York 47831
## North Carolina 48798
## North Dakota 69273
## Ohio 40975
## Oklahoma 68782
## Oregon 96184
## Pennsylvania 44966
## Rhode Island 1049
## South Carolina 30225
## South Dakota 75955
## Tennessee 41328
## Texas 262134
## Utah 82096
## Vermont 9267
## Virginia 39780
## Washington 66570
## West Virginia 24070
## Wisconsin 54464
## Wyoming 97203
Datos1=as.data.frame(Datos)
Objetivo
Estudiaremos el efecto de las variables regresoras o predictoras;
“Population” e “Illiteracy” sobre la variable de respuesta “Murder”.
str(Datos1)
## 'data.frame': 50 obs. of 8 variables:
## $ Population: num 3615 365 2212 2110 21198 ...
## $ Income : num 3624 6315 4530 3378 5114 ...
## $ Illiteracy: num 2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
## $ Life Exp : num 69 69.3 70.5 70.7 71.7 ...
## $ Murder : num 15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
## $ HS Grad : num 41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
## $ Frost : num 20 152 15 65 20 166 139 103 11 60 ...
## $ Area : num 50708 566432 113417 51945 156361 ...
summary(Datos)
## Population Income Illiteracy Life Exp
## Min. : 365 Min. :3098 Min. :0.500 Min. :67.96
## 1st Qu.: 1080 1st Qu.:3993 1st Qu.:0.625 1st Qu.:70.12
## Median : 2838 Median :4519 Median :0.950 Median :70.67
## Mean : 4246 Mean :4436 Mean :1.170 Mean :70.88
## 3rd Qu.: 4968 3rd Qu.:4814 3rd Qu.:1.575 3rd Qu.:71.89
## Max. :21198 Max. :6315 Max. :2.800 Max. :73.60
## Murder HS Grad Frost Area
## Min. : 1.400 Min. :37.80 Min. : 0.00 Min. : 1049
## 1st Qu.: 4.350 1st Qu.:48.05 1st Qu.: 66.25 1st Qu.: 36985
## Median : 6.850 Median :53.25 Median :114.50 Median : 54277
## Mean : 7.378 Mean :53.11 Mean :104.46 Mean : 70736
## 3rd Qu.:10.675 3rd Qu.:59.15 3rd Qu.:139.75 3rd Qu.: 81163
## Max. :15.100 Max. :67.30 Max. :188.00 Max. :566432
class(Datos1)
## [1] "data.frame"
Datos1=as.data.frame(Datos)
Murder ~ Population
## Murder ~ Population
par(mfrow = c(1, 1))
plot(Datos1$Murder,Datos1$Population,pch = 19, col = "lightblue")
abline(lm(Datos1$Population ~ Datos1$Murder), col = "red", lwd = 3)
text(Datos1$Murder,Datos1$Population,labels=Datos1$new, cex=0.7)
Murder ~ Illiteracy
## Murder ~ Illiteracy
plot(Datos1$Murder,Datos1$Illiteracy,pch = 19, col = "lightblue")
abline(lm(Datos1$Illiteracy ~ Datos1$Murder), col = "red", lwd = 3)
text(Datos1$Murder,Datos1$Illiteracy,labels=Datos1$new, cex=0.7)
Murder ~ Illiteracy
MD=lm(Datos1$Murder ~ Datos1$Illiteracy, Data=Datos1)
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'Data' will be disregarded
summary(MD)
##
## Call:
## lm(formula = Datos1$Murder ~ Datos1$Illiteracy, Data = Datos1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5315 -2.0602 -0.2503 1.6916 6.9745
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.3968 0.8184 2.928 0.0052 **
## Datos1$Illiteracy 4.2575 0.6217 6.848 1.26e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.653 on 48 degrees of freedom
## Multiple R-squared: 0.4942, Adjusted R-squared: 0.4836
## F-statistic: 46.89 on 1 and 48 DF, p-value: 1.258e-08
Murder ~ Population
MD1=lm(Datos1$Murder ~ Datos1$Population, Data=Datos1)
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'Data' will be disregarded
summary(MD1)
##
## Call:
## lm(formula = Datos1$Murder ~ Datos1$Population, Data = Datos1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9855 -3.0119 -0.3128 2.4986 7.9014
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.1713934 0.6869410 8.984 7.49e-12 ***
## Datos1$Population 0.0002841 0.0001121 2.535 0.0146 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.503 on 48 degrees of freedom
## Multiple R-squared: 0.1181, Adjusted R-squared: 0.09972
## F-statistic: 6.427 on 1 and 48 DF, p-value: 0.01455
corPlot(Datos1, cex = 1.2)
model = lm(Datos1$Illiteracy ~ Datos1$Murder, data=Datos1)
par(mfrow = c(2, 2))
plot(model)
Esta variable suele mostrar una correlación más clara y fuerte con la tasa de asesinatos que la población. Históricamente, en estos datos de 1977, el analfabetismo funciona como un excelente indicador del entorno socioeconómico.
model1 = lm(Datos1$Population ~ Datos1$Murder, data=Datos1)
par(mfrow = c(2, 2))
plot(model1)
Comportamiento: Generalmente, existe una relación positiva, pero no siempre es lineal. A menudo, el aumento de asesinatos no es proporcional al aumento de personas de forma simple; los estados extremadamente grandes pueden actuar como puntos de influencia (Leverage) que mueven la línea de regresión de manera desproporcionada.
Observación técnica: En este set de datos, la población por sí sola suele ser un predictor “ruidoso”. Es posible que los residuos muestren que el modelo predice peor en estados muy poblados que en los pequeños.
modelo_pop_quad <- lm(Murder ~ Population + I(Population^2), data = Datos1)
summary(modelo_pop_quad)
##
## Call:
## lm(formula = Murder ~ Population + I(Population^2), data = Datos1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.300 -3.017 0.178 1.951 7.643
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.495e+00 9.262e-01 5.933 3.4e-07 ***
## Population 6.113e-04 3.211e-04 1.904 0.0631 .
## I(Population^2) -1.896e-08 1.745e-08 -1.087 0.2826
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.496 on 47 degrees of freedom
## Multiple R-squared: 0.1397, Adjusted R-squared: 0.1031
## F-statistic: 3.816 on 2 and 47 DF, p-value: 0.02912
modelo_ill_quad <- lm(Murder ~ Illiteracy + I(Illiteracy^2), data = Datos1)
summary(modelo_ill_quad)
##
## Call:
## lm(formula = Murder ~ Illiteracy + I(Illiteracy^2), data = Datos1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.7211 -2.0173 -0.2176 1.7202 7.1698
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.6627 1.9198 0.866 0.3909
## Illiteracy 5.5642 3.1486 1.767 0.0837 .
## I(Illiteracy^2) -0.4586 1.0830 -0.424 0.6739
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.676 on 47 degrees of freedom
## Multiple R-squared: 0.4961, Adjusted R-squared: 0.4747
## F-statistic: 23.14 on 2 and 47 DF, p-value: 1.012e-07
p1=ggplot(Datos1, aes(x = Population, y = Murder)) +
geom_point() +
geom_smooth(method = "lm", color = "blue", se = FALSE) + # Línea recta
geom_smooth(method = "lm", formula = y ~ x + I(x^2), color = "red", se = FALSE) + # Curva
labs(title = "Ajuste Lineal (Azul) vs Cuadrático (Rojo)",
subtitle = "Variable: Analfabetismo")
p2=ggplot(Datos1, aes(x = Illiteracy, y = Murder)) +
geom_point() +
geom_smooth(method = "lm", color = "blue", se = FALSE) + # Línea recta
geom_smooth(method = "lm", formula = y ~ x + I(x^2), color = "red", se = FALSE) + # Curva
labs(title = "Ajuste Lineal (Azul) vs Cuadrático (Rojo)",
subtitle = "Variable: Analfabetismo")
grid.arrange(p1, p2, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
Visualmente, el ajuste cuadratico y la regresión lineal son muy similares, en otras palabras, el ajuste cuadratico no está aportando información relevante para explicar la variable de respuesta (murder), en este sentido hay que buscar una estrategia diferente que realmente nos muestre con precisión que variables pueden explicar estadisticamente la tasa de homicidios en esta base datos.
new <- data.frame(x=c(0.25, 1.2, 2.1, 3.0, 4.0))
predict(model, newdata = new)
## Warning: 'newdata' had 5 rows but variables found have 50 rows
## 1 2 3 4 5 6 7 8
## 2.0663127 1.6252368 1.2189826 1.4859497 1.5091642 1.1029100 0.6734414 1.0332665
## 9 10 11 12 13 14 15 16
## 1.5555932 1.9270256 1.0332665 0.9288011 1.5091642 1.1377318 0.5805833 0.8359430
## 17 18 19 20 21 22 23 24
## 1.5439860 1.8457748 0.6270123 1.3002335 0.6966559 1.6020223 0.5805833 1.7645239
## 25 26 27 28 29 30 31 32
## 1.3930916 0.8939793 0.6502268 1.6484513 0.6966559 0.9171938 1.4395206 1.5788077
## 33 34 35 36 37 38 39 40
## 1.6020223 0.4761179 1.1725536 1.0564810 0.8011212 1.0216592 0.5921905 1.6600586
## 41 42 43 44 45 46 47 48
## 0.5109397 1.5904150 1.7297021 0.8359430 0.9520156 1.4163061 0.8127285 1.0913028
## 49 50
## 0.6618341 1.1145173
De acuerdo a los valores proporcionados; los valores 1.2 y 2.1 se están interpolando, mientras que los valores 0.25, 3 y 4 se están extrapolando, esto se explica por el rango de valores que comprende nuestra variable independiente.