---
title: "22041"
output:
  html_document: default
---
library(dplyr)
library(ggplot2)
library(stringr)
library(readr)
library(sjlabelled)
library(sjmisc)
library(sjstats)
library(ggeffects)
library(sjPlot)
library(knitr)
library(psych)
library(magrittr)
library(kableExtra)
library(tidytext)
library(RColorBrewer)
library(haven)
df <- read_csv("D:/Documents/data1_1.csv")
ESS5e03 <- read_sav("D:/Documents/ESS5e03.sav")
# Read the literature and justify what you do and what you expect
# Draw scatterplots to make sure further analysis is necessary
# Estimate the regression model
# Test the overall quality of the regression model
# Interpret regression coefficients (point estimates and confidence intervals)
# Compare the strength of association with standardized coefficients
# Predict the outcome
df$books = as.numeric(as.character(df$books))
df$attend = as.numeric(as.character(df$attend))
df$grade = as.numeric(as.character(df$grade))
str(df)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 40 obs. of 3 variables:
## $ books : num 0 1 0 2 4 4 1 4 3 0 ...
## $ attend: num 9 15 10 16 10 20 11 20 15 15 ...
## $ grade : num 45 57 45 51 65 88 44 87 89 59 ...
## - attr(*, "spec")=
## .. cols(
## .. books = col_double(),
## .. attend = col_double(),
## .. grade = col_double()
## .. )
summary(df)
## books attend grade
## Min. :0 Min. : 6.00 Min. :37.00
## 1st Qu.:1 1st Qu.:10.00 1st Qu.:51.00
## Median :2 Median :15.00 Median :60.50
## Mean :2 Mean :14.10 Mean :63.55
## 3rd Qu.:3 3rd Qu.:18.25 3rd Qu.:73.00
## Max. :4 Max. :20.00 Max. :97.00
par(mfrow=c(1, 3))
boxplot(df$books, main = "books", sub = paste("Outliers:", paste(boxplot.stats(df$books)$out, collapse = ", ")))
boxplot(df$attend, main = "attend", sub = paste("Outliers:", paste(boxplot.stats(df$attend)$out, collapse = ", ")))
boxplot(df$grade, main = "grade", sub = paste("Outliers:", paste(boxplot.stats(df$grade)$out, collapse = ", ")))
library(cowplot)
##
## Attaching package: 'cowplot'
## The following objects are masked from 'package:sjPlot':
##
## plot_grid, save_plot
## The following object is masked from 'package:ggplot2':
##
## ggsave
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
bo = ggplot(data = df, aes(x = books)) + geom_density(col = "pink", fill = "pink", alpha = 0.5)
at = ggplot(data = df, aes(x = attend)) + geom_density(col = "blue", fill = "blue", alpha = 0.5)
gr = ggplot(data = df, aes(x = grade)) + geom_density(col = "yellow", fill = "yellow", alpha = 0.5)
plot_grid(bo, at, gr)
sc1 = ggplot(data = df, aes(x = books, y = grade)) + geom_point() + geom_smooth(method = lm, fill="blue", color="blue", se = FALSE)
sc2 = ggplot(data = df, aes(x = attend, y = grade)) + geom_point() + geom_smooth(method = lm, fill="blue", color="blue", se = FALSE)
sc3 = ggplot(data = df, aes(x = attend, y = books)) + geom_point() + geom_smooth(method=lm, fill="blue", color="blue", se = FALSE)
plot_grid(sc1, sc2, sc3)
q = cor(df)  # store the Pearson correlation matrix of the three variables
sjp.corr(df, show.legend = TRUE)
## Computing correlation using pearson-method with listwise-deletion...
## Warning: Removed 6 rows containing missing values (geom_text).
model1 = lm(grade ~ books, data = df)
model2 = lm(grade ~ attend, data = df)
model3 = lm(grade ~ books + attend, data = df)
library(snakecase)
sjPlot::tab_model(model1)
Dependent variable: grade

| Predictors | Estimates | CI | p |
|---|---|---|---|
| (Intercept) | 52.07 | 44.17 – 59.98 | <0.001 |
| books | 5.74 | 2.51 – 8.97 | 0.001 |
| Observations | 40 | | |
| R2 / adjusted R2 | 0.242 / 0.222 | | |
sjPlot::tab_model(model2)
Dependent variable: grade

| Predictors | Estimates | CI | p |
|---|---|---|---|
| (Intercept) | 37.00 | 20.99 – 53.01 | <0.001 |
| attend | 1.88 | 0.80 – 2.97 | 0.002 |
| Observations | 40 | | |
| R2 / adjusted R2 | 0.233 / 0.212 | | |
sjPlot::tab_model(model3)
Dependent variable: grade

| Predictors | Estimates | CI | p |
|---|---|---|---|
| (Intercept) | 37.38 | 22.20 – 52.56 | <0.001 |
| books | 4.04 | 0.60 – 7.47 | 0.027 |
| attend | 1.28 | 0.13 – 2.43 | 0.035 |
| Observations | 40 | | |
| R2 / adjusted R2 | 0.329 / 0.292 | | |
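The plan above also mentions comparing the strength of association with standardized coefficients, which the tables do not show. A minimal sketch that refits model3 on z-standardized variables (the standardization step is an addition here, not part of the output above):

# refit the full model with all variables z-standardized to obtain beta coefficients
model3_std = lm(scale(grade) ~ scale(books) + scale(attend), data = df)
summary(model3_std)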
# How to read the model summaries:
# F-statistic: check its p-value (Pr(>F)); if it is significant, the model as a whole is acceptable.
# Adjusted R-squared: how well the model fits, i.e. the share of variance in the outcome explained by the predictors.
# Estimate: the intercept is the predicted outcome when the predictor equals 0. For model2 (grade ~ attend),
# a student who attends no classes is predicted to score 37; attending 2 classes gives 37 + 2 * 1.88.
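# The same point prediction can be obtained with predict(); a minimal sketch
# (attend = 2 is just an illustrative input value):
predict(model2, newdata = data.frame(attend = 2), interval = "confidence")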
# Compare the models with an F-test; the models being compared must be nested.
anova(model1, model3)
## Analysis of Variance Table
##
## Model 1: grade ~ books
## Model 2: grade ~ books + attend
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 38 8250.4
## 2 37 7306.2 1 944.16 4.7814 0.03517 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model2, model3)
## Analysis of Variance Table
##
## Model 1: grade ~ attend
## Model 2: grade ~ books + attend
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 38 8353.4
## 2 37 7306.2 1 1047.1 5.3028 0.02701 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There is a helpful LINE mnemonic for the regression assumptions: Linearity, Independence of errors, Normality of residuals, and Equal variance.
The independence assumption holds because… The other assumptions can be checked by examining the residuals of the models.
par(mfrow = c(2, 2))
plot(model1)
We then look at the diagnostic plots for the other two models as well. It may be enough to examine only one of them, but all three are shown for completeness.
par(mfrow = c(2, 2))
plot(model2)
Conclusions for model 2.
par(mfrow = c(2, 2))
plot(model3)
Conclusions for model 3.
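Beyond the visual diagnostics, normality and equal variance can also be tested formally. A minimal sketch for the full model, assuming the lmtest package is installed (it is not loaded above):

# Shapiro-Wilk test for normality of the residuals of model3
shapiro.test(residuals(model3))
# Breusch-Pagan test for heteroscedasticity
lmtest::bptest(model3)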
Some background on the second part of the analysis, which uses the ESS data.
#data <- read_sav("ESS5e03.sav", to.data.frame = TRUE)
data = ESS5e03
savevars <- c("eduyrs", "agea", "agertr", "gndr",
"cntry", "icmnact", "hinctnta")
ESS1 <- data[savevars]
rm(data, savevars)
ESS2 = ESS1 %>%
select(eduyrs, agea, agertr, gndr, cntry, icmnact, hinctnta)
Let's take a look at the dataset.
str(ESS2)
## Classes 'tbl_df', 'tbl' and 'data.frame': 52458 obs. of 7 variables:
## $ eduyrs : 'haven_labelled' num 15 15 13 15 15 13 17 13 12 12 ...
## ..- attr(*, "label")= chr "Years of full-time education completed"
## ..- attr(*, "format.spss")= chr "F2.0"
## ..- attr(*, "labels")= Named num 77 88 99
## .. ..- attr(*, "names")= chr "Refusal" "Don't know" "No answer"
## $ agea : 'haven_labelled' num 22 43 19 23 58 62 26 40 48 68 ...
## ..- attr(*, "label")= chr "Age of respondent, calculated"
## ..- attr(*, "format.spss")= chr "F4.0"
## ..- attr(*, "labels")= Named num 999
## .. ..- attr(*, "names")= chr "Not available"
## $ agertr : 'haven_labelled' num NA NA NA NA 60 60 NA NA 58 61 ...
## ..- attr(*, "label")= chr "What age you would like to/would have liked to retire"
## ..- attr(*, "format.spss")= chr "F3.0"
## ..- attr(*, "labels")= Named num 666 777 888 999
## .. ..- attr(*, "names")= chr "Not applicable" "Refusal" "Don't know" "No answer"
## $ gndr : 'haven_labelled' num 1 1 2 2 2 2 1 1 1 1 ...
## ..- attr(*, "label")= chr "Gender"
## ..- attr(*, "format.spss")= chr "F1.0"
## ..- attr(*, "display_width")= int 6
## ..- attr(*, "labels")= Named num 1 2 9
## .. ..- attr(*, "names")= chr "Male" "Female" "No answer"
## $ cntry : 'haven_labelled' chr "BE" "BE" "BE" "BE" ...
## ..- attr(*, "label")= chr "Country"
## ..- attr(*, "format.spss")= chr "A2"
## ..- attr(*, "display_width")= int 7
## ..- attr(*, "labels")= Named chr "UA" "GB" "BE" "DE" ...
## .. ..- attr(*, "names")= chr "Ukraine" "United Kingdom" "Belgium" "Germany" ...
## $ icmnact : 'haven_labelled' num 3 1 3 1 3 2 1 1 1 2 ...
## ..- attr(*, "label")= chr "Interviewer code, main activity of respondent"
## ..- attr(*, "format.spss")= chr "F1.0"
## ..- attr(*, "display_width")= int 9
## ..- attr(*, "labels")= Named num 1 2 3 9
## .. ..- attr(*, "names")= chr "In paid work" "Retired" "All others" "Not available"
## $ hinctnta: 'haven_labelled' num NA 5 1 10 6 5 6 2 6 8 ...
## ..- attr(*, "label")= chr "Household's total net income, all sources"
## ..- attr(*, "format.spss")= chr "F2.0"
## ..- attr(*, "display_width")= int 10
## ..- attr(*, "labels")= Named num 1 2 3 4 5 6 7 8 9 10 ...
## .. ..- attr(*, "names")= chr "J" "R" "C" "M" ...
summary(ESS2)
## eduyrs agea agertr gndr
## Min. : 0.00 Min. : 14.00 Min. : 0.00 Min. :1.000
## 1st Qu.:10.00 1st Qu.: 33.00 1st Qu.: 56.00 1st Qu.:1.000
## Median :12.00 Median : 48.00 Median : 60.00 Median :2.000
## Mean :12.29 Mean : 48.51 Mean : 60.04 Mean :1.546
## 3rd Qu.:15.00 3rd Qu.: 63.00 3rd Qu.: 64.00 3rd Qu.:2.000
## Max. :55.00 Max. :102.00 Max. :120.00 Max. :2.000
## NA's :629 NA's :153 NA's :27902 NA's :21
## cntry icmnact hinctnta
## Length:52458 Min. :1.000 Min. : 1.000
## Class :haven_labelled 1st Qu.:1.000 1st Qu.: 3.000
## Mode :character Median :2.000 Median : 5.000
## Mean :1.812 Mean : 5.049
## 3rd Qu.:3.000 3rd Qu.: 7.000
## Max. :3.000 Max. :10.000
## NA's :134 NA's :12620
ESS2$eduyrs = as.numeric(as.character(ESS2$eduyrs))
ESS2$agertr = as.numeric(as.character(ESS2$agertr))
ESS2$agea = as.numeric(as.character(ESS2$agea))
ESS2$hinctnta = as.numeric(as.character(ESS2$hinctnta))
A descriptive table of the recoded variables would go here.
The plot below lets us look at everything at once:
how the quantitative variables are distributed (the diagonal panels show the univariate distributions; in the first and third panels the data splits into five bars of roughly equal height);
the correlation coefficient for each pair of variables;
and, since the correlations look rather weak, the numerical values themselves, from which conclusions can be drawn directly.
This is very convenient: one plot replaces ten separate ones.
my_fn1 <- function(data, mapping, ...){
p1 <- ggplot(data = data, mapping = mapping) +
geom_point() +
geom_smooth(method=lm, fill="blue", color="blue", ...)
p1
}
library(GGally)
g1 = ggpairs(ESS2,columns = c(1,2,3,7), lower = list(continuous = my_fn1), title= "1")
g1
If X1 and X2 interact, this means that the effect of X1 on Y depends on the value of X2, and vice versa.
ESS4 <- na.omit(subset(ESS2, icmnact==1))
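Following the note on interactions, a term such as agea * gndr could be included; a minimal sketch (this interaction model is only an illustration and is not part of the analysis below):

# the effect of age on preferred retirement age is allowed to differ by gender
m_int = lm(agertr ~ agea * gndr, data = ESS4)
summary(m_int)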
m1 <- lm(agertr ~ agea, data = ESS4)
summary(m1)
##
## Call:
## lm(formula = agertr ~ agea, data = ESS4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.250 -2.506 0.195 2.505 58.728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.33278 0.56274 84.11 <2e-16 ***
## agea 0.24455 0.01035 23.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.169 on 7425 degrees of freedom
## Multiple R-squared: 0.06996, Adjusted R-squared: 0.06983
## F-statistic: 558.5 on 1 and 7425 DF, p-value: < 2.2e-16
m2 <- lm(agertr ~ agea + gndr, data = ESS4)
summary(m2)
##
## Call:
## lm(formula = agertr ~ agea + gndr, data = ESS4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.962 -2.723 -0.002 2.384 57.993
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.92842 0.59212 84.32 <2e-16 ***
## agea 0.23869 0.01025 23.30 <2e-16 ***
## gndr -1.52691 0.11879 -12.85 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.113 on 7424 degrees of freedom
## Multiple R-squared: 0.09021, Adjusted R-squared: 0.08996
## F-statistic: 368 on 2 and 7424 DF, p-value: < 2.2e-16
anova(m1, m2)
## Analysis of Variance Table
##
## Model 1: agertr ~ agea
## Model 2: agertr ~ agea + gndr
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7425 198412
## 2 7424 194093 1 4319.4 165.21 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m3 <- lm(agertr ~ agea + gndr + hinctnta, data = ESS4)
summary(m3)
##
## Call:
## lm(formula = agertr ~ agea + gndr + hinctnta, data = ESS4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.321 -2.667 0.033 2.424 57.785
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.71478 0.62708 77.685 < 2e-16 ***
## agea 0.24357 0.01026 23.745 < 2e-16 ***
## gndr -1.44434 0.11939 -12.097 < 2e-16 ***
## hinctnta 0.13259 0.02295 5.776 7.95e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.102 on 7423 degrees of freedom
## Multiple R-squared: 0.09428, Adjusted R-squared: 0.09391
## F-statistic: 257.6 on 3 and 7423 DF, p-value: < 2.2e-16
anova(m2, m3)
## Analysis of Variance Table
##
## Model 1: agertr ~ agea + gndr
## Model 2: agertr ~ agea + gndr + hinctnta
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7424 194093
## 2 7423 193224 1 868.5 33.365 7.949e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m4 <- lm(agertr ~ agea + gndr + hinctnta + eduyrs, data = ESS4)
summary(m4)
##
## Call:
## lm(formula = agertr ~ agea + gndr + hinctnta + eduyrs, data = ESS4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.964 -2.634 0.073 2.459 57.756
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.26782 0.65084 72.626 < 2e-16 ***
## agea 0.24686 0.01022 24.145 < 2e-16 ***
## gndr -1.52126 0.11930 -12.751 < 2e-16 ***
## hinctnta 0.07234 0.02410 3.002 0.0027 **
## eduyrs 0.13201 0.01672 7.896 3.31e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.081 on 7422 degrees of freedom
## Multiple R-squared: 0.1018, Adjusted R-squared: 0.1013
## F-statistic: 210.3 on 4 and 7422 DF, p-value: < 2.2e-16
anova(m3, m4)
## Analysis of Variance Table
##
## Model 1: agertr ~ agea + gndr + hinctnta
## Model 2: agertr ~ agea + gndr + hinctnta + eduyrs
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 7423 193224
## 2 7422 191615 1 1609.4 62.339 3.307e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
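Since m4 is preferred by the F-tests, we can also look at its predicted values. A minimal sketch using ggpredict() from the ggeffects package loaded above (the choice of eduyrs as the focal term is only an illustration):

# predicted retirement age across years of education, other predictors held at typical values
pr_edu = ggpredict(m4, terms = "eduyrs")
plot(pr_edu)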
To practise once more, let's look at the diagnostics for the most interesting model only, the one with the full set of predictors.
par(mfrow = c(2, 2))
plot(m4)
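With several predictors in m4, multicollinearity is also worth checking. A minimal sketch assuming the car package is installed (it is not loaded above):

# variance inflation factors; values far above 5-10 would indicate problematic collinearity
car::vif(m4)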