knitr::opts_chunk$set(fig.width=12, fig.height=8, fig.path='Figs/',
warning=FALSE, message=FALSE, error = FALSE)
Code for the first part does not contain any comments, cause everythings you can view by the LINK. I’ve tried to be short, however if 12 mins are too much, there are time stamps under the vid for some navigation. Enjoy~
library(haven)
ds <- read_sav("C:/Users/ASUS/Documents/SCHOOL CHOICE/WORKING/SchoolChoice_dataclean_withisei_06172019.sav")
#choice
ds$q24[ds$q24 == "1"] <- "Yes"
ds$q24[ds$q24 == "0"] <- "No"
ds$q24[ds$q24 == "4"] <- NA
ds$q24 <- as.factor(ds$q24)
#educational capital
ds$momedu2[ds$momedu2 == "0"] <- "0-NO higher education"
ds$momedu2[ds$momedu2 == "1"] <- "1-Higher education"
ds$momedu2 <- as.factor(ds$momedu2)
#socio-economic capital
ds$ISEI <- (ds$ISEI_08_dad + ds$ISEI_08_mom)/2
#cultural capital
ds$q41_22[ds$q41 == "1" | ds$q41 == "2" | ds$q41 == "3" ] <- "2 shelves and less (<80 books)"
ds$q41_22[ ds$q41 == "4" | ds$q41 == "5" | ds$q41 == "6" ] <- "3 shelves and more (<90 books)"
ds$q41_22 <- as.factor(ds$q41_22)
library(dplyr)
data_model <- ds %>% select(q24, momedu2, ISEI, q41_22) %>% na.omit()
names(data_model) <- c ("choice", "momedu", "ISEI", "books")
summary(data_model)
## choice momedu ISEI
## No :733 0-NO higher education:279 Min. :23.00
## Yes:495 1-Higher education :949 1st Qu.:43.00
## Median :52.00
## Mean :51.19
## 3rd Qu.:59.00
## Max. :85.00
## books
## 2 shelves and less (<80 books):502
## 3 shelves and more (<90 books):726
##
##
##
##
#hist(data_model$ISEI)
model_1 <- glm(choice ~ ISEI + momedu + books, data = data_model, family="binomial"(link=logit), na.action = na.exclude)
summary(model_1) # оразование матери (0.034), isei (0.06), культ капитал (больше половины - 0.002) AIC: 1633.2
##
## Call:
## glm(formula = choice ~ ISEI + momedu + books, family = binomial(link = logit),
## data = data_model, na.action = na.exclude)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2314 -1.0434 -0.8628 1.2449 1.7272
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.51243 0.30187 -5.010 5.44e-07
## ISEI 0.01199 0.00650 1.845 0.06504
## momedu1-Higher education 0.35321 0.16667 2.119 0.03407
## books3 shelves and more (<90 books) 0.37397 0.12366 3.024 0.00249
##
## (Intercept) ***
## ISEI .
## momedu1-Higher education *
## books3 shelves and more (<90 books) **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1655.9 on 1227 degrees of freedom
## Residual deviance: 1625.4 on 1224 degrees of freedom
## AIC: 1633.4
##
## Number of Fisher Scoring iterations: 4
#model_2 <- glm(choice ~ momedu + books, data = data_model, family="binomial"(link=logit), na.action = na.exclude)
#summary(model_2)
#anova(model_1, model_2)
library(effects) # for beautiful regressions
plot(allEffects(model_1))
library(haven)
df <- read.csv("C:/Users/ASUS/Documents/Data Analysis/gen/2018_var5_2.csv")
library(dplyr)
df1 <- df %>% select("pol_int", "marit", "age", "trstparl")
summary(df1)
## pol_int marit age trstparl
## Min. :0.0000 Divorced: 97 Min. :16.00 Min. : 1.000
## 1st Qu.:0.3300 Married : 52 1st Qu.:47.25 1st Qu.: 3.000
## Median :0.6600 Single :154 Median :67.50 Median : 6.000
## Mean :0.5036 Widowed :171 Mean :61.41 Mean : 5.249
## 3rd Qu.:0.6600 3rd Qu.:76.00 3rd Qu.: 7.000
## Max. :1.0000 Max. :94.00 Max. :11.000
model1 <- lm(data = df1, trstparl ~ pol_int)
summary(model1)
##
## Call:
## lm(formula = trstparl ~ pol_int, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9639 -1.9639 0.5258 2.0011 6.4764
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.5236 0.2112 21.41 < 2e-16 ***
## pol_int 1.4403 0.3496 4.12 4.48e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.542 on 472 degrees of freedom
## Multiple R-squared: 0.03471, Adjusted R-squared: 0.03267
## F-statistic: 16.97 on 1 and 472 DF, p-value: 4.478e-05
df2 <- df1
df2$trstparl[df2$trstparl >= 6] <- "trust"
df2$trstparl[df2$trstparl < 6] <- "dont"
df2$trstparl <- as.factor(df2$trstparl)
summary(df2)
## pol_int marit age trstparl
## Min. :0.0000 Divorced: 97 Min. :16.00 dont :228
## 1st Qu.:0.3300 Married : 52 1st Qu.:47.25 trust:246
## Median :0.6600 Single :154 Median :67.50
## Mean :0.5036 Widowed :171 Mean :61.41
## 3rd Qu.:0.6600 3rd Qu.:76.00
## Max. :1.0000 Max. :94.00
model_21 <- glm(trstparl ~ pol_int, data = df2, family="binomial"(link=logit), na.action = na.exclude)
summary(model_21) #
##
## Call:
## glm(formula = trstparl ~ pol_int, family = binomial(link = logit),
## data = df2, na.action = na.exclude)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.357 -1.256 1.007 1.101 1.292
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.2652 0.1676 -1.583 0.1135
## pol_int 0.6790 0.2785 2.438 0.0148 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 656.42 on 473 degrees of freedom
## Residual deviance: 650.40 on 472 degrees of freedom
## AIC: 654.4
##
## Number of Fisher Scoring iterations: 4
#model_22 <- glm(trstparl ~ pol_int + marit, data = df2, family="binomial"(link=logit), na.action = na.exclude)
#summary(model_22)
#model_23 <- glm(trstparl ~ pol_int + marit + age, data = df2, family="binomial"(link=logit), na.action = na.exclude)
#summary(model_23)
#anova(model_21, model_22, model_23)
library(effects)
plot(allEffects(model_21))
#plot(allEffects(model_23))
#check on linear version
plot(allEffects(model1))
#check on linear interactive model
model_int <- lm(data = df1, trstparl ~ pol_int * marit)
summary(model_int)
##
## Call:
## lm(formula = trstparl ~ pol_int * marit, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1390 -1.8298 0.4323 1.8560 6.7438
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.2562 0.4496 7.242 1.84e-12 ***
## pol_int 2.8827 0.6982 4.129 4.32e-05 ***
## maritMarried 1.1170 0.8530 1.309 0.191022
## maritSingle 1.2101 0.5609 2.157 0.031479 *
## maritWidowed 2.2522 0.5872 3.836 0.000142 ***
## pol_int:maritMarried -1.2656 1.3426 -0.943 0.346335
## pol_int:maritSingle -1.2532 0.9189 -1.364 0.173268
## pol_int:maritWidowed -2.7929 0.9335 -2.992 0.002919 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.517 on 466 degrees of freedom
## Multiple R-squared: 0.06506, Adjusted R-squared: 0.05101
## F-statistic: 4.632 on 7 and 466 DF, p-value: 4.996e-05
plot(allEffects(model_int))
plot_int <- predictorEffect("marit", model_int)
plot(plot_int, lines=list(multiline=TRUE), confint=list(style="auto"))
plot_int2 <- predictorEffect("pol_int", model_int)
plot(plot_int2, lines=list(multiline=TRUE), confint=list(style="auto"))
the hint from Artyom Kulikov
I do really like the idea of easy cutting the numeric variable on categories, that is why I have decided to try Artyom’s hint. For that I’m using ISEI from my school data. To understand on how many caterogies I want to divide my variable I have built a histogram and tried out different bidwidth. Quite fast I’ve realised that I like the division by 10 levels of the index, which has made me 7 categories. By comparing my histogram before rewriting the variable and barplot after rewriting it we may see that the picture stays the same, while the type of the variable has been changed.
summary(data_model)
## choice momedu ISEI
## No :733 0-NO higher education:279 Min. :23.00
## Yes:495 1-Higher education :949 1st Qu.:43.00
## Median :52.00
## Mean :51.19
## 3rd Qu.:59.00
## Max. :85.00
## books
## 2 shelves and less (<80 books):502
## 3 shelves and more (<90 books):726
##
##
##
##
library(ggplot2)
ggplot() +
geom_histogram(data = data_model,
aes(x = ISEI),
binwidth = 10,
fill = "cornflowerblue",
col= "black",
alpha = 0.7) +
labs(subtitle="mean=51.22 (white solid line), median=52 (white dashed line),\nmin.=23, 1st Qu.=43, 3rd Qu.=59, max.=85",
y="Number of respondents",
x="Family's socio-economic status",
title="ISEI for family") +
geom_vline(aes(xintercept = mean(data_model$ISEI)), linetype="solid", color="white", size=1.5) +
geom_vline(aes(xintercept = median(data_model$ISEI)), linetype="dashed", color="white", size=1.5) +
theme_bw()
library(Hmisc)
data_model$ISEI_cat <- cut2(data_model$ISEI, c(25, 35, 45, 55, 65, 75))
summary(data_model$ISEI_cat)
## [23,25) [25,35) [35,45) [45,55) [55,65) [65,75) [75,85]
## 3 79 270 388 351 133 4
levels(data_model$ISEI_cat) <- c("Extremely Low", "Very Low", "Almost Average", "Average", "Above Average", "Very High", "Extremely High")
summary(data_model$ISEI_cat)
## Extremely Low Very Low Almost Average Average Above Average
## 3 79 270 388 351
## Very High Extremely High
## 133 4
plot(data_model$ISEI_cat)
the hint from Ksenia Bochkareva
In fact, I am familiar with the package and I know how cool is that. Let’s try it out. I have found this package very useful for presenting the residuals, because colors may help a lot it visualizing it in a table form.
For example, for my work I have done lots of chi-square test, and vizualization of residuals on plots I do not like (IMHO). I like tables and here it is.
In this part I was exploring the factors, that parents consider while choosing the school. And closeness is important for every parent, despite their level of SES. However, let us run the chi-square test to look at HOW important is it. The question for parents was to mark how important for them each of the factors and here are the results of the factor of closeness.
# closeness - без ВО ставят на первое место, ВО - на третье
t_momedu_27 <- with(ds, table(momedu2, q27.10)) # X-squared = 23.042, df = 3, p-value = 3.957e-05; 1 - !_HE, 3 - HE
t_momedu_27 <- na.omit(t_momedu_27)
chi_momedu_27 <- chisq.test(t_momedu_27)
chi_momedu_27
##
## Pearson's Chi-squared test
##
## data: t_momedu_27
## X-squared = 23.042, df = 3, p-value = 3.957e-05
chi_momedu_27$stdres %>% round(digits = 2)
## q27.10
## momedu2 0 1 2 3
## 0-NO higher education -1.56 4.09 -0.80 -3.51
## 1-Higher education 1.56 -4.09 0.80 3.51
Mother_ed <- c("Has higher education", "Has NO higher education")
ignore <- c("1.56", "-1.56") %>% as.numeric()
imp1 <- c("-4.09 ", "4.09 ") %>% as.numeric()
imp2 <- c("0.80", "-0.80") %>% as.numeric()
imp3 <- c("3.51", "-3.51") %>% as.numeric()
res27.10 <- data.frame(Mother_ed, ignore, imp1, imp2, imp3)
res27.10
Above there is a table with resulduals. And we see, that while for mothers without higher education closeness is the first priority factor, mothers with higher education, relegate it to the background. I have driven that assumption by looking at the resulduals and comparing it with my threshold, which is |1.8|. And formattable package in this case would help me to highlight the nessesary impormation. Let’s do it.
library(formattable)
res27.1_10 <- formattable(res27.10,
align =c("r","c","c","c"),
list(
ignore = formatter("span", style = x ~ ifelse(x >= 1.8, "color:green", ifelse(x <= -1.8, "color:red", "color:grey"))),
imp1 = formatter("span", style = x ~ ifelse(x >= 1.8, "color:green", ifelse(x <= -1.8, "color:red", "color:grey"))),
imp2 = formatter("span", style = x ~ ifelse(x >= 1.8, "color:green", ifelse(x <= -1.8, "color:red", "color:grey"))),
imp3 = formatter("span", style = x ~ ifelse(x >= 1.8, "color:green", ifelse(x <= -1.8, "color:red", "color:grey")))))
res27.1_10
| Mother_ed | ignore | imp1 | imp2 | imp3 |
|---|---|---|---|---|
| Has higher education | 1.56 | -4.09 | 0.8 | 3.51 |
| Has NO higher education | -1.56 | 4.09 | -0.8 | -3.51 |
the hint from Liza Dyahenko and Anya Gorobtsova which can be seen on the top of my script