knitr::opts_chunk$set(fig.width=12, fig.height=8, fig.path='Figs/',
                      warning=FALSE, message=FALSE, error = FALSE)
  • ctrl+F for replacing(easy)
  • allEffects plots fo binary regression

Code for the first part does not contain any comments, cause everythings you can view by the LINK. I’ve tried to be short, however if 12 mins are too much, there are time stamps under the vid for some navigation. Enjoy~

my data

library(haven)
ds <- read_sav("C:/Users/ASUS/Documents/SCHOOL CHOICE/WORKING/SchoolChoice_dataclean_withisei_06172019.sav")

  #choice
ds$q24[ds$q24 == "1"] <- "Yes"
ds$q24[ds$q24 == "0"] <- "No"
ds$q24[ds$q24 == "4"] <- NA
ds$q24 <- as.factor(ds$q24)

  #educational capital
ds$momedu2[ds$momedu2 == "0"] <- "0-NO higher education"
ds$momedu2[ds$momedu2 == "1"] <- "1-Higher education"
ds$momedu2 <- as.factor(ds$momedu2) 

  #socio-economic capital
ds$ISEI <- (ds$ISEI_08_dad + ds$ISEI_08_mom)/2

  #cultural capital
ds$q41_22[ds$q41 == "1" | ds$q41 == "2" | ds$q41 == "3" ] <- "2 shelves and less (<80 books)"
ds$q41_22[ ds$q41 == "4" | ds$q41 == "5" | ds$q41 == "6" ] <- "3 shelves and more (<90 books)"
ds$q41_22 <- as.factor(ds$q41_22)

library(dplyr)
data_model <- ds %>% select(q24, momedu2, ISEI, q41_22) %>% na.omit()
names(data_model) <- c ("choice", "momedu", "ISEI", "books")
summary(data_model)
##  choice                      momedu         ISEI      
##  No :733   0-NO higher education:279   Min.   :23.00  
##  Yes:495   1-Higher education   :949   1st Qu.:43.00  
##                                        Median :52.00  
##                                        Mean   :51.19  
##                                        3rd Qu.:59.00  
##                                        Max.   :85.00  
##                             books    
##  2 shelves and less (<80 books):502  
##  3 shelves and more (<90 books):726  
##                                      
##                                      
##                                      
## 
#hist(data_model$ISEI)

library(effects) on the choice data

model_1 <- glm(choice ~ ISEI + momedu + books, data = data_model, family="binomial"(link=logit), na.action = na.exclude)
summary(model_1) # оразование матери (0.034), isei (0.06), культ капитал (больше половины - 0.002) AIC: 1633.2
## 
## Call:
## glm(formula = choice ~ ISEI + momedu + books, family = binomial(link = logit), 
##     data = data_model, na.action = na.exclude)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2314  -1.0434  -0.8628   1.2449   1.7272  
## 
## Coefficients:
##                                     Estimate Std. Error z value Pr(>|z|)
## (Intercept)                         -1.51243    0.30187  -5.010 5.44e-07
## ISEI                                 0.01199    0.00650   1.845  0.06504
## momedu1-Higher education             0.35321    0.16667   2.119  0.03407
## books3 shelves and more (<90 books)  0.37397    0.12366   3.024  0.00249
##                                        
## (Intercept)                         ***
## ISEI                                .  
## momedu1-Higher education            *  
## books3 shelves and more (<90 books) ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1655.9  on 1227  degrees of freedom
## Residual deviance: 1625.4  on 1224  degrees of freedom
## AIC: 1633.4
## 
## Number of Fisher Scoring iterations: 4
#model_2 <- glm(choice ~ momedu + books, data = data_model, family="binomial"(link=logit), na.action = na.exclude)
#summary(model_2) 

#anova(model_1, model_2)

library(effects) # for beautiful regressions
plot(allEffects(model_1))

trust in politics data

library(haven)
df <- read.csv("C:/Users/ASUS/Documents/Data Analysis/gen/2018_var5_2.csv")
library(dplyr)
df1 <- df %>% select("pol_int", "marit", "age", "trstparl")
summary(df1)
##     pol_int            marit          age           trstparl     
##  Min.   :0.0000   Divorced: 97   Min.   :16.00   Min.   : 1.000  
##  1st Qu.:0.3300   Married : 52   1st Qu.:47.25   1st Qu.: 3.000  
##  Median :0.6600   Single  :154   Median :67.50   Median : 6.000  
##  Mean   :0.5036   Widowed :171   Mean   :61.41   Mean   : 5.249  
##  3rd Qu.:0.6600                  3rd Qu.:76.00   3rd Qu.: 7.000  
##  Max.   :1.0000                  Max.   :94.00   Max.   :11.000
model1 <- lm(data = df1, trstparl ~ pol_int)
summary(model1)
## 
## Call:
## lm(formula = trstparl ~ pol_int, data = df1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.9639 -1.9639  0.5258  2.0011  6.4764 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.5236     0.2112   21.41  < 2e-16 ***
## pol_int       1.4403     0.3496    4.12 4.48e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.542 on 472 degrees of freedom
## Multiple R-squared:  0.03471,    Adjusted R-squared:  0.03267 
## F-statistic: 16.97 on 1 and 472 DF,  p-value: 4.478e-05
df2 <- df1
df2$trstparl[df2$trstparl >= 6] <- "trust"
df2$trstparl[df2$trstparl < 6] <- "dont"
df2$trstparl <- as.factor(df2$trstparl)

summary(df2)
##     pol_int            marit          age         trstparl  
##  Min.   :0.0000   Divorced: 97   Min.   :16.00   dont :228  
##  1st Qu.:0.3300   Married : 52   1st Qu.:47.25   trust:246  
##  Median :0.6600   Single  :154   Median :67.50              
##  Mean   :0.5036   Widowed :171   Mean   :61.41              
##  3rd Qu.:0.6600                  3rd Qu.:76.00              
##  Max.   :1.0000                  Max.   :94.00
model_21 <- glm(trstparl ~ pol_int, data = df2, family="binomial"(link=logit), na.action = na.exclude)
summary(model_21) # 
## 
## Call:
## glm(formula = trstparl ~ pol_int, family = binomial(link = logit), 
##     data = df2, na.action = na.exclude)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.357  -1.256   1.007   1.101   1.292  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -0.2652     0.1676  -1.583   0.1135  
## pol_int       0.6790     0.2785   2.438   0.0148 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 656.42  on 473  degrees of freedom
## Residual deviance: 650.40  on 472  degrees of freedom
## AIC: 654.4
## 
## Number of Fisher Scoring iterations: 4
#model_22 <- glm(trstparl ~ pol_int + marit, data = df2, family="binomial"(link=logit), na.action = na.exclude)
#summary(model_22)

#model_23 <- glm(trstparl ~ pol_int + marit + age, data = df2, family="binomial"(link=logit), na.action = na.exclude)
#summary(model_23)

#anova(model_21, model_22, model_23)

library(effects) 
plot(allEffects(model_21))

#plot(allEffects(model_23))

#check on linear version
plot(allEffects(model1))

#check on linear interactive model
model_int <- lm(data = df1, trstparl ~ pol_int * marit)
summary(model_int)
## 
## Call:
## lm(formula = trstparl ~ pol_int * marit, data = df1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.1390 -1.8298  0.4323  1.8560  6.7438 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            3.2562     0.4496   7.242 1.84e-12 ***
## pol_int                2.8827     0.6982   4.129 4.32e-05 ***
## maritMarried           1.1170     0.8530   1.309 0.191022    
## maritSingle            1.2101     0.5609   2.157 0.031479 *  
## maritWidowed           2.2522     0.5872   3.836 0.000142 ***
## pol_int:maritMarried  -1.2656     1.3426  -0.943 0.346335    
## pol_int:maritSingle   -1.2532     0.9189  -1.364 0.173268    
## pol_int:maritWidowed  -2.7929     0.9335  -2.992 0.002919 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.517 on 466 degrees of freedom
## Multiple R-squared:  0.06506,    Adjusted R-squared:  0.05101 
## F-statistic: 4.632 on 7 and 466 DF,  p-value: 4.996e-05
plot(allEffects(model_int))

plot_int <- predictorEffect("marit", model_int)
plot(plot_int, lines=list(multiline=TRUE), confint=list(style="auto"))

plot_int2 <- predictorEffect("pol_int", model_int)
plot(plot_int2, lines=list(multiline=TRUE), confint=list(style="auto"))

trying out others’ RGems

Categorization of numeric variables with library(Hmisc)

the hint from Artyom Kulikov

I do really like the idea of easy cutting the numeric variable on categories, that is why I have decided to try Artyom’s hint. For that I’m using ISEI from my school data. To understand on how many caterogies I want to divide my variable I have built a histogram and tried out different bidwidth. Quite fast I’ve realised that I like the division by 10 levels of the index, which has made me 7 categories. By comparing my histogram before rewriting the variable and barplot after rewriting it we may see that the picture stays the same, while the type of the variable has been changed.

summary(data_model)
##  choice                      momedu         ISEI      
##  No :733   0-NO higher education:279   Min.   :23.00  
##  Yes:495   1-Higher education   :949   1st Qu.:43.00  
##                                        Median :52.00  
##                                        Mean   :51.19  
##                                        3rd Qu.:59.00  
##                                        Max.   :85.00  
##                             books    
##  2 shelves and less (<80 books):502  
##  3 shelves and more (<90 books):726  
##                                      
##                                      
##                                      
## 
library(ggplot2)
ggplot() + 
  geom_histogram(data = data_model, 
                 aes(x = ISEI), 
                 binwidth = 10, 
                 fill = "cornflowerblue", 
                 col= "black", 
                 alpha = 0.7) +
  labs(subtitle="mean=51.22 (white solid line), median=52 (white dashed line),\nmin.=23, 1st Qu.=43, 3rd Qu.=59, max.=85", 
       y="Number of respondents", 
       x="Family's socio-economic status", 
       title="ISEI for family") +
  geom_vline(aes(xintercept = mean(data_model$ISEI)), linetype="solid", color="white", size=1.5) +
  geom_vline(aes(xintercept = median(data_model$ISEI)), linetype="dashed", color="white", size=1.5) +
  theme_bw()

library(Hmisc)
data_model$ISEI_cat <- cut2(data_model$ISEI, c(25, 35, 45, 55, 65, 75))
summary(data_model$ISEI_cat)
## [23,25) [25,35) [35,45) [45,55) [55,65) [65,75) [75,85] 
##       3      79     270     388     351     133       4
levels(data_model$ISEI_cat) <- c("Extremely Low", "Very Low", "Almost Average", "Average", "Above Average", "Very High", "Extremely High")
summary(data_model$ISEI_cat)
##  Extremely Low       Very Low Almost Average        Average  Above Average 
##              3             79            270            388            351 
##      Very High Extremely High 
##            133              4
plot(data_model$ISEI_cat)

Table formatting with library(formattable)

the hint from Ksenia Bochkareva

In fact, I am familiar with the package and I know how cool is that. Let’s try it out. I have found this package very useful for presenting the residuals, because colors may help a lot it visualizing it in a table form.

For example, for my work I have done lots of chi-square test, and vizualization of residuals on plots I do not like (IMHO). I like tables and here it is.

In this part I was exploring the factors, that parents consider while choosing the school. And closeness is important for every parent, despite their level of SES. However, let us run the chi-square test to look at HOW important is it. The question for parents was to mark how important for them each of the factors and here are the results of the factor of closeness.

# closeness - без ВО ставят на первое место, ВО - на третье
t_momedu_27 <- with(ds, table(momedu2, q27.10)) # X-squared = 23.042, df = 3, p-value = 3.957e-05; 1 - !_HE, 3 - HE
t_momedu_27 <- na.omit(t_momedu_27)
chi_momedu_27 <- chisq.test(t_momedu_27)
chi_momedu_27
## 
##  Pearson's Chi-squared test
## 
## data:  t_momedu_27
## X-squared = 23.042, df = 3, p-value = 3.957e-05
chi_momedu_27$stdres %>% round(digits = 2)
##                        q27.10
## momedu2                     0     1     2     3
##   0-NO higher education -1.56  4.09 -0.80 -3.51
##   1-Higher education     1.56 -4.09  0.80  3.51
Mother_ed <- c("Has higher education", "Has NO higher education")
ignore <- c("1.56", "-1.56") %>% as.numeric()
imp1 <- c("-4.09 ", "4.09 ") %>% as.numeric()
imp2 <- c("0.80", "-0.80") %>% as.numeric()
imp3 <- c("3.51", "-3.51") %>% as.numeric()
res27.10 <- data.frame(Mother_ed, ignore, imp1, imp2, imp3)

res27.10

Above there is a table with resulduals. And we see, that while for mothers without higher education closeness is the first priority factor, mothers with higher education, relegate it to the background. I have driven that assumption by looking at the resulduals and comparing it with my threshold, which is |1.8|. And formattable package in this case would help me to highlight the nessesary impormation. Let’s do it.

library(formattable)
res27.1_10 <- formattable(res27.10,
                          align =c("r","c","c","c"),
                          list(
                            ignore = formatter("span", style = x ~ ifelse(x >= 1.8, "color:green", ifelse(x <= -1.8, "color:red", "color:grey"))),
                            imp1 = formatter("span", style = x ~ ifelse(x >= 1.8, "color:green", ifelse(x <= -1.8, "color:red", "color:grey"))),
                            imp2 = formatter("span", style = x ~ ifelse(x >= 1.8, "color:green", ifelse(x <= -1.8, "color:red", "color:grey"))),
                            imp3 = formatter("span", style = x ~ ifelse(x >= 1.8, "color:green", ifelse(x <= -1.8, "color:red", "color:grey")))))

res27.1_10
Mother_ed ignore imp1 imp2 imp3
Has higher education 1.56 -4.09 0.8 3.51
Has NO higher education -1.56 4.09 -0.8 -3.51

Сustomization of the template with CSS selectors

the hint from Liza Dyahenko and Anya Gorobtsova which can be seen on the top of my script