library(psych)
library(performance) #check_model(), check_collinearity, multicollinearity
library(lme4)
library(see)
library(car) #vif()
#library(qqconf)
#library(qqplotr)
library(dplyr)
library(mlogit)
library(MASS)
df1 = read.table("../data/DadosTaniaTorres/db_auto_auto_AJUSTE.dat", header=TRUE)
df1 = as.data.frame(df1)
df1$V_severidadeDiv = factor(df1$V_severidade)# df1$V_severidade/3
df1 = df1[sample(nrow(df1), 3000), ]
glimpse(df1)
## Rows: 3,000
## Columns: 38
## $ ID <dbl> 608922, 614656, 586282, 605520, 579102, 584803,…
## $ V_cruzamento <dbl> 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,…
## $ V_find <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ V_severidade <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1,…
## $ V_chuva <dbl> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
## $ V_dia <dbl> 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,…
## $ N_semaforos <dbl> 3, 6, 11, 6, 1, 13, 7, 4, 6, 12, 4, 2, 12, 11, …
## $ N_semaforos_cruzamento <dbl> 3, 1, 6, 5, 0, 11, 2, 2, 4, 12, 2, 2, 10, 5, 8,…
## $ N_comercio <dbl> 46, 61, 408, 62, 50, 580, 172, 96, 309, 202, 10…
## $ N_Industria <dbl> 4, 3, 11, 3, 4, 24, 13, 6, 7, 2, 8, 2, 5, 9, 19…
## $ N_servicos <dbl> 45, 40, 694, 111, 44, 1296, 162, 73, 687, 851, …
## $ N_comserv <dbl> 91, 101, 1102, 173, 94, 1876, 334, 169, 996, 10…
## $ N_escolas <dbl> 2, 1, 9, 3, 2, 3, 1, 4, 0, 4, 5, 6, 4, 4, 7, 5,…
## $ N_paradas <dbl> 304, 173, 424, 283, 114, 388, 271, 312, 133, 39…
## $ L_ciclo <dbl> 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,…
## $ L_bus <dbl> 798.9019, 1076.5627, 698.8840, 0.0000, 0.0000, …
## $ Declive_abs <dbl> 0.0034, 0.0955, 0.0046, 0.0466, 0.0570, 0.0195,…
## $ L_vias <dbl> 1059.6461, 1076.5651, 674.5346, 1538.0717, 630.…
## $ N_links <dbl> 16, 26, 24, 50, 48, 36, 58, 23, 29, 42, 24, 59,…
## $ Link_4 <dbl> 9, 2, 8, 12, 8, 18, 10, 14, 10, 21, 15, 13, 12,…
## $ P_link4 <dbl> 0.5625, 0.0769, 0.3333, 0.2400, 0.1667, 0.5000,…
## $ Comp_med <dbl> 66.2279, 41.4064, 28.1056, 30.7614, 13.1436, 43…
## $ P_arterial <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Idiv <dbl> 0.7388, 0.5900, 0.6447, 0.6345, 0.6092, 0.5933,…
## $ Pop <dbl> 33, 548, 331, 105, 101, 511, 72, 33, 627, 218, …
## $ P_idade_jov <dbl> 0.1737, 0.3006, 0.0761, 0.1537, 0.2285, 0.1181,…
## $ P_idade_ad <dbl> 0.7233, 0.6417, 0.6164, 0.7137, 0.6751, 0.6926,…
## $ P_idade_idos <dbl> 0.1030, 0.0577, 0.1125, 0.1326, 0.0965, 0.1894,…
## $ Renda_m <dbl> 299762.5, 372126.0, 699096.8, 1843607.9, 808879…
## $ ddom <dbl> 550.8712, 1286.3154, 9245.2359, 3898.1916, 2859…
## $ dpop <dbl> 1416.457, 4254.257, 17408.009, 9712.313, 8418.7…
## $ P_ilu_pub <dbl> 1.0000, 0.7214, 0.7856, 0.9999, 0.8921, 1.0000,…
## $ P_calc <dbl> 0.9981, 0.4139, 0.8050, 1.0000, 0.8356, 1.0000,…
## $ P_pav <dbl> 1.0000, 0.5370, 0.8050, 0.9984, 0.9029, 1.0000,…
## $ P_arbor <dbl> 0.9999, 0.7107, 0.7763, 0.9999, 0.8402, 0.9982,…
## $ P_rcadeirante <dbl> 0.4617, 0.0206, 0.5619, 0.2298, 0.1286, 0.2941,…
## $ P_MF <dbl> 1.0000, 0.4646, 0.8050, 0.9999, 0.8430, 1.0000,…
## $ V_severidadeDiv <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1,…
df2 = df1[,c("V_severidade", "P_arterial", "N_comercio", "N_semaforos", "dpop")]
df2$V_severidade = factor(df2$V_severidade)
head(df2)
pairs.panels(df1[,3:15])
round(cor(df1[, 1:10]), 2)
## ID V_cruzamento V_find V_severidade V_chuva V_dia
## ID 1.00 0.13 -0.02 -0.01 0.07 -0.05
## V_cruzamento 0.13 1.00 0.01 0.16 -0.04 -0.05
## V_find -0.02 0.01 1.00 0.10 0.02 -0.14
## V_severidade -0.01 0.16 0.10 1.00 -0.05 -0.16
## V_chuva 0.07 -0.04 0.02 -0.05 1.00 -0.04
## V_dia -0.05 -0.05 -0.14 -0.16 -0.04 1.00
## N_semaforos -0.04 -0.07 0.01 -0.06 0.01 -0.02
## N_semaforos_cruzamento -0.03 -0.10 0.00 -0.05 0.02 -0.01
## N_comercio -0.04 -0.05 -0.01 -0.02 -0.02 0.01
## N_Industria -0.04 -0.11 -0.04 0.00 -0.02 0.03
## N_semaforos N_semaforos_cruzamento N_comercio
## ID -0.04 -0.03 -0.04
## V_cruzamento -0.07 -0.10 -0.05
## V_find 0.01 0.00 -0.01
## V_severidade -0.06 -0.05 -0.02
## V_chuva 0.01 0.02 -0.02
## V_dia -0.02 -0.01 0.01
## N_semaforos 1.00 0.83 0.57
## N_semaforos_cruzamento 0.83 1.00 0.63
## N_comercio 0.57 0.63 1.00
## N_Industria 0.30 0.31 0.60
## N_Industria
## ID -0.04
## V_cruzamento -0.11
## V_find -0.04
## V_severidade 0.00
## V_chuva -0.02
## V_dia 0.03
## N_semaforos 0.30
## N_semaforos_cruzamento 0.31
## N_comercio 0.60
## N_Industria 1.00
m1 <- lm(V_severidade ~ P_arterial + N_comercio + N_semaforos + dpop, data = df1)
summary(m1)
##
## Call:
## lm(formula = V_severidade ~ P_arterial + N_comercio + N_semaforos +
## dpop, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.15682 -0.10685 -0.09423 -0.07849 1.91825
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.154e+00 3.351e-02 34.431 <2e-16 ***
## P_arterial -3.610e-02 3.210e-02 -1.125 0.2608
## N_comercio 8.791e-06 2.512e-05 0.350 0.7264
## N_semaforos -3.037e-03 1.219e-03 -2.492 0.0128 *
## dpop 1.653e-07 1.234e-06 0.134 0.8935
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2977 on 2995 degrees of freedom
## Multiple R-squared: 0.003698, Adjusted R-squared: 0.002368
## F-statistic: 2.779 on 4 and 2995 DF, p-value: 0.02545
model_performance(m1)
check_model(m1)
car::vif(m1)
## P_arterial N_comercio N_semaforos dpop
## 1.102307 1.597426 1.699022 1.316174
check_collinearity(m1)
m2 <- lm(V_severidade ~ P_arterial + N_comercio + Comp_med + V_chuva, data = df1)
summary(m2)
##
## Call:
## lm(formula = V_severidade ~ P_arterial + N_comercio + Comp_med +
## V_chuva, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.16007 -0.10370 -0.09994 -0.08787 1.91948
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.160e+00 3.144e-02 36.900 <2e-16 ***
## P_arterial -5.289e-02 3.115e-02 -1.698 0.0896 .
## N_comercio -2.912e-05 2.021e-05 -1.441 0.1497
## Comp_med -1.073e-05 9.331e-05 -0.115 0.9085
## V_chuva -4.153e-02 1.625e-02 -2.556 0.0107 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2977 on 2995 degrees of freedom
## Multiple R-squared: 0.003669, Adjusted R-squared: 0.002338
## F-statistic: 2.757 on 4 and 2995 DF, p-value: 0.02643
m3 <- lm(V_severidade ~ P_arterial + N_links + N_semaforos_cruzamento + N_semaforos, data = df1)
summary(m3)
##
## Call:
## lm(formula = V_severidade ~ P_arterial + N_links + N_semaforos_cruzamento +
## N_semaforos, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.19522 -0.10574 -0.09348 -0.07868 1.92611
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.1448132 0.0341984 33.476 <2e-16 ***
## P_arterial -0.0382320 0.0310646 -1.231 0.219
## N_links 0.0003500 0.0003619 0.967 0.333
## N_semaforos_cruzamento -0.0010075 0.0022842 -0.441 0.659
## N_semaforos -0.0022514 0.0017078 -1.318 0.188
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2976 on 2995 degrees of freedom
## Multiple R-squared: 0.003998, Adjusted R-squared: 0.002667
## F-statistic: 3.005 on 4 and 2995 DF, p-value: 0.01735
compare_performance(m1, m2, m3,rank = TRUE)
plot(compare_performance(m1, m2, m3))
modelo_probit<- glm(factor(V_severidadeDiv) ~ P_arterial + N_comercio + N_semaforos + dpop, family = binomial(link = "probit"), data = df1)
modelo_probit
##
## Call: glm(formula = factor(V_severidadeDiv) ~ P_arterial + N_comercio +
## N_semaforos + dpop, family = binomial(link = "probit"), data = df1)
##
## Coefficients:
## (Intercept) P_arterial N_comercio N_semaforos dpop
## -1.018e+00 -1.741e-01 3.194e-05 -1.893e-02 3.054e-06
##
## Degrees of Freedom: 2999 Total (i.e. Null); 2995 Residual
## Null Deviance: 1884
## Residual Deviance: 1872 AIC: 1882
summary(modelo_probit)
##
## Call:
## glm(formula = factor(V_severidadeDiv) ~ P_arterial + N_comercio +
## N_semaforos + dpop, family = binomial(link = "probit"), data = df1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6039 -0.4715 -0.4392 -0.4007 2.3883
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.018e+00 1.778e-01 -5.727 1.02e-08 ***
## P_arterial -1.741e-01 1.690e-01 -1.031 0.3027
## N_comercio 3.194e-05 1.500e-04 0.213 0.8314
## N_semaforos -1.893e-02 7.358e-03 -2.573 0.0101 *
## dpop 3.054e-06 7.257e-06 0.421 0.6739
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1883.7 on 2999 degrees of freedom
## Residual deviance: 1871.9 on 2995 degrees of freedom
## AIC: 1881.9
##
## Number of Fisher Scoring iterations: 5
check_collinearity(modelo_probit)
exp(modelo_probit$coefficients)
## (Intercept) P_arterial N_comercio N_semaforos dpop
## 0.3612298 0.8401711 1.0000319 0.9812462 1.0000031
coef(m1)
## (Intercept) P_arterial N_comercio N_semaforos dpop
## 1.153917e+00 -3.610171e-02 8.790878e-06 -3.036673e-03 1.652938e-07
ProbitScalar <- mean(dlogis(predict(modelo_probit, type = "link")))
ProbitScalar * coef(modelo_probit)
## (Intercept) P_arterial N_comercio N_semaforos dpop
## -1.695769e-01 -2.900273e-02 5.318546e-06 -3.152898e-03 5.085714e-07
polsreg<- predict(m1)
summary(m1)
##
## Call:
## lm(formula = V_severidade ~ P_arterial + N_comercio + N_semaforos +
## dpop, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.15682 -0.10685 -0.09423 -0.07849 1.91825
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.154e+00 3.351e-02 34.431 <2e-16 ***
## P_arterial -3.610e-02 3.210e-02 -1.125 0.2608
## N_comercio 8.791e-06 2.512e-05 0.350 0.7264
## N_semaforos -3.037e-03 1.219e-03 -2.492 0.0128 *
## dpop 1.653e-07 1.234e-06 0.134 0.8935
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2977 on 2995 degrees of freedom
## Multiple R-squared: 0.003698, Adjusted R-squared: 0.002368
## F-statistic: 2.779 on 4 and 2995 DF, p-value: 0.02545
pprobit<- predict(modelo_probit, type="response")
summary(pprobit)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04775 0.08223 0.09560 0.09500 0.10744 0.16669
as.data.frame(table(true = df1$V_severidadeDiv, pred = round(fitted(modelo_probit))))
probit0<-update(modelo_probit, formula= df1$V_severidadeDiv ~ 1)
McFadden<- 1-as.vector(logLik(modelo_probit)/logLik(probit0))
McFadden
## [1] 0.00627975
m1 <- polr(factor(V_severidade) ~ P_arterial + N_comercio + N_semaforos + dpop, data = df1, Hess=TRUE)
summary(m1)
## Call:
## polr(formula = factor(V_severidade) ~ P_arterial + N_comercio +
## N_semaforos + dpop, data = df1, Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## P_arterial -3.290e-01 0.0002930 -1122.7660
## N_comercio 6.070e-05 0.0002663 0.2280
## N_semaforos -3.777e-02 0.0171080 -2.2079
## dpop 4.999e-06 0.0000220 0.2272
##
## Intercepts:
## Value Std. Error t value
## 1|2 1.6823 0.0006 2714.6192
## 2|3 6.3397 0.0055 1151.3460
##
## Residual Deviance: 1905.073
## AIC: 1917.073
model_performance(m1)
## Can't calculate log-loss.
## Can't calculate proper scoring rules for ordinal, multinomial or cumulative link models.
car::vif(m1)
## P_arterial N_comercio N_semaforos dpop
## -1.039454e+16 -2.671639e+09 -1.039730e+16 -1.632974e+06
m1.coef <- data.frame(coef(summary(m1)))
m1.coef$pval = round((pnorm(abs(m1.coef$t.value), lower.tail = FALSE) * 2),2)
m1.coef
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
stargazer(m1)
##
## % Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
## % Date and time: Mon, May 01, 2023 - 22:00:18
## \begin{table}[!htbp] \centering
## \caption{}
## \label{}
## \begin{tabular}{@{\extracolsep{5pt}}lc}
## \\[-1.8ex]\hline
## \hline \\[-1.8ex]
## & \multicolumn{1}{c}{\textit{Dependent variable:}} \\
## \cline{2-2}
## \\[-1.8ex] & V\_severidade \\
## \hline \\[-1.8ex]
## P\_arterial & $-$0.329$^{***}$ \\
## & (0.0003) \\
## & \\
## N\_comercio & 0.0001 \\
## & (0.0003) \\
## & \\
## N\_semaforos & $-$0.038$^{**}$ \\
## & (0.017) \\
## & \\
## dpop & 0.00000 \\
## & (0.00002) \\
## & \\
## \hline \\[-1.8ex]
## Observations & 3,000 \\
## \hline
## \hline \\[-1.8ex]
## \textit{Note:} & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\
## \end{tabular}
## \end{table}
m1.or=exp(coef(m1))
m1.or
## P_arterial N_comercio N_semaforos dpop
## 0.7196419 1.0000607 0.9629313 1.0000050
m1.pred <- predict(m1, type="probs")
summary(m1.pred)
## 1 2 3
## Min. :0.8318 Min. :0.04815 Min. :0.0004850
## 1st Qu.:0.8924 1st Qu.:0.08125 1st Qu.:0.0008481
## Median :0.9048 Median :0.09417 Median :0.0009972
## Mean :0.9050 Mean :0.09399 Mean :0.0009996
## 3rd Qu.:0.9179 3rd Qu.:0.10643 3rd Qu.:0.0011428
## Max. :0.9514 Max. :0.16624 Max. :0.0019149
setup1 <- data.frame(V_find=c(0,1), P_arterial=c(0,1), N_comercio=sample.int(max(df1$N_comercio), 2),N_semaforos=sample.int(max(df1$N_semaforos), 2), dpop=sample.int(max(df1$Pop),2))
setup1[, c("pred.prob")] <- predict(m1, newdata=setup1, type="probs")
setup1
setup1[, c("pred.prob")] <- predict(m1, newdata=setup1, type="class")
setup1
library(erer)
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 4.0.5
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
x <- ocME(m1)
x
## effect.1 effect.2 effect.3
## P_arterial 0.032 -0.031 0
## N_comercio 0.000 0.000 0
## N_semaforos 0.003 -0.003 0
## dpop 0.000 0.000 0
x$out
## $ME.1
## effect error t.value p.value
## P_arterial 0.032 0.002 16.093 0.000
## N_comercio 0.000 0.000 -0.229 0.819
## N_semaforos 0.003 0.002 2.105 0.035
## dpop 0.000 0.000 -0.225 0.822
##
## $ME.2
## effect error t.value p.value
## P_arterial -0.031 0.002 -16.151 0.000
## N_comercio 0.000 0.000 0.228 0.819
## N_semaforos -0.003 0.002 -2.106 0.035
## dpop 0.000 0.000 0.225 0.822
##
## $ME.3
## effect error t.value p.value
## P_arterial 0 0 -12.403 0.000
## N_comercio 0 0 0.229 0.819
## N_semaforos 0 0 -2.058 0.040
## dpop 0 0 0.224 0.823
##
## $ME.all
## effect.1 effect.2 effect.3
## P_arterial 0.032 -0.031 0
## N_comercio 0.000 0.000 0
## N_semaforos 0.003 -0.003 0
## dpop 0.000 0.000 0
library(foreign)
library(nnet)
library(stargazer)
#load("hsb2.rda")
#mydata = hsb2
table(df1$V_severidade)
##
## 1 2 3
## 2715 282 3
#df1$V_severidade = relevel(df1$V_severidade , ref = "2")
multi1 = multinom(V_severidade ~ P_arterial + N_comercio + N_semaforos + dpop, data=df1)
## # weights: 18 (10 variable)
## initial value 3295.836866
## iter 10 value 1330.130676
## iter 20 value 965.505399
## iter 30 value 949.672557
## iter 40 value 949.312664
## final value 949.305278
## converged
summary(multi1)
## Call:
## multinom(formula = V_severidade ~ P_arterial + N_comercio + N_semaforos +
## dpop, data = df1)
##
## Coefficients:
## (Intercept) P_arterial N_comercio N_semaforos dpop
## 2 -1.688523 -0.3388964 -1.446696e-05 -0.03835034 8.130551e-06
## 3 -11.175336 4.8400685 2.215822e-03 0.06007690 -2.671274e-04
##
## Std. Errors:
## (Intercept) P_arterial N_comercio N_semaforos dpop
## 2 5.363579e-04 6.897839e-04 0.0003028625 1.320964e-02 1.160083e-05
## 3 9.948649e-07 9.980798e-07 0.0010312603 1.587701e-05 1.237549e-04
##
## Residual Deviance: 1898.611
## AIC: 1918.611
model_performance(multi1)
## Can't calculate log-loss.
## Can't calculate proper scoring rules for ordinal, multinomial or cumulative link models.
car::vif(multi1)
## Warning in vif.default(multi1): No intercept: vifs may not be sensible.
## P_arterial N_comercio N_semaforos dpop
## -1.128740e+15 -4.686312e+07 -3.906456e+15 6.491845e+03
multi1.rrr = exp(coef(multi1))
multi1.rrr
## (Intercept) P_arterial N_comercio N_semaforos dpop
## 2 1.847923e-01 0.7125562 0.9999855 0.9623757 1.0000081
## 3 1.401565e-05 126.4780126 1.0022183 1.0619182 0.9997329
df <- data.frame(V_find=c(0,1), P_arterial=c(0,1), N_comercio=sample.int(max(df1$N_comercio), 2),N_semaforos=sample.int(max(df1$N_semaforos), 2), dpop=sample.int(max(df1$Pop),2))
df
df[, c("pred.prob")] <- predict(multi1, newdata=df, type="probs")
df
df[, c("pred.prob")] <- predict(multi1, newdata=df, type="class")
df
compare_performance(m1,multi1,rank = TRUE)
## Following indices with missing values are not used for ranking:
## R2_Nagelkerke, R2, R2_adjusted
plot(compare_performance(m1, multi1))