title: ‘22041’ output: html_document: default —

library(dplyr)
library(ggplot2)
library(stringr)
library(readr)
library(sjlabelled)
library(sjmisc) 
library(sjstats) 
library(ggeffects)  
library(sjPlot)
library(knitr)
library(psych)
library(magrittr)
library(kableExtra)
library(tidytext)
library(RColorBrewer)
library(haven)
df <- read_csv("D:/Documents/data1_1.csv")
ESS5e03 <- read_sav("D:/Documents/ESS5e03.sav")
# Read the literature and jutify what you do and want to expect
# Draw scatterplots to make sure further analysis is necessary
# Estimate the regression model
# Test the overall quality of the regression model
# Interpret regression coefficients (point estimates andconfidence intervals)
# Compare the strength of association with standardized coefficients
# Predict the outcome

df$books = as.numeric(as.character(df$books))
df$attend = as.numeric(as.character(df$attend))
df$grade = as.numeric(as.character(df$grade))

Have a look at the data we have

str(df)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 40 obs. of  3 variables:
##  $ books : num  0 1 0 2 4 4 1 4 3 0 ...
##  $ attend: num  9 15 10 16 10 20 11 20 15 15 ...
##  $ grade : num  45 57 45 51 65 88 44 87 89 59 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   books = col_double(),
##   ..   attend = col_double(),
##   ..   grade = col_double()
##   .. )
summary(df)
##      books       attend          grade      
##  Min.   :0   Min.   : 6.00   Min.   :37.00  
##  1st Qu.:1   1st Qu.:10.00   1st Qu.:51.00  
##  Median :2   Median :15.00   Median :60.50  
##  Mean   :2   Mean   :14.10   Mean   :63.55  
##  3rd Qu.:3   3rd Qu.:18.25   3rd Qu.:73.00  
##  Max.   :4   Max.   :20.00   Max.   :97.00

Using BoxPlot To Check For Outliers

par(mfrow=c(1, 3))
boxplot(df$books, main="books", sub=paste("Outlier rows: ", boxplot.stats(df$books)$out))
boxplot(df$attend, main="attend", sub=paste("Outlier rows: ", boxplot.stats(df$attend)$out))
boxplot(df$grade, main="grade", sub=paste("Outlier rows: ", boxplot.stats(df$grade)$out))

Using Density Plots To Check If Variable Is Close To Normal

library(cowplot)
## 
## Attaching package: 'cowplot'
## The following objects are masked from 'package:sjPlot':
## 
##     plot_grid, save_plot
## The following object is masked from 'package:ggplot2':
## 
##     ggsave
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
bo = ggplot(data = df, aes(x = books))  + geom_density(col = "pink", fill = "pink", alpha = 0.5)
at = ggplot(data = df, aes(x = attend)) + geom_density(col = "blue", fill = "blue", alpha = 0.5)
gr = ggplot(data = df, aes(x = grade)) + geom_density(col = "yellow", fill = "yellow", alpha = 0.5)

plot_grid(bo, at, gr)

Using Scatter Plot To Visualise The Relationship

sc1 = ggplot(data = df, aes(x = books, y = grade))  + geom_point() + geom_smooth(method = lm, fill="blue", color="blue", se = FALSE)
sc2 = ggplot(data = df, aes(x = attend, y = grade)) + geom_point() + geom_smooth(method = lm, fill="blue", color="blue", se = FALSE)
sc3 = ggplot(data = df, aes(x = attend, y = books)) + geom_point() + geom_smooth(method=lm, fill="blue", color="blue", se = FALSE)

plot_grid(sc1, sc2, sc3)

Look at correlation coefficients

q = cor(df)
sjp.corr(df, show.legend = TRUE)
## Computing correlation using pearson-method with listwise-deletion...
## Warning: Removed 6 rows containing missing values (geom_text).


Делаем модели

model1 = lm(grade ~ books, data = df)
model2 = lm(grade ~ attend, data = df)
model3 = lm(grade ~ books + attend, data = df)
library(snakecase)
sjPlot::tab_model(model1)
  grade
Predictors Estimates CI p
(Intercept) 52.07 44.17 – 59.98 <0.001
books 5.74 2.51 – 8.97 0.001
Observations 40
R2 / adjusted R2 0.242 / 0.222
sjPlot::tab_model(model2)
  grade
Predictors Estimates CI p
(Intercept) 37.00 20.99 – 53.01 <0.001
attend 1.88 0.80 – 2.97 0.002
Observations 40
R2 / adjusted R2 0.233 / 0.212
sjPlot::tab_model(model3)
  grade
Predictors Estimates CI p
(Intercept) 37.38 22.20 – 52.56 <0.001
books 4.04 0.60 – 7.47 0.027
attend 1.28 0.13 – 2.43 0.035
Observations 40
R2 / adjusted R2 0.329 / 0.292
# the algorithm for summary2
#F-statistics to look at P-value (PR(>F)) - if significant, the model is OK
#Adjusted R squared: to look how good the model is: it is : how many percent the first variable explains the second 
#Estimate: the intercept if the variable if equal to 0. In the case of grade and attend(2), if we do not atend classes we receive 37. If we attend 2 classes, it will be 37 + 2*1.883

Comparing Models

#Compare models. They should be Nested models. 
anova(model1, model3)
## Analysis of Variance Table
## 
## Model 1: grade ~ books
## Model 2: grade ~ books + attend
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1     38 8250.4                              
## 2     37 7306.2  1    944.16 4.7814 0.03517 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model2, model3)
## Analysis of Variance Table
## 
## Model 1: grade ~ attend
## Model 2: grade ~ books + attend
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1     38 8353.4                              
## 2     37 7306.2  1    1047.1 5.3028 0.02701 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Checking Linear Regression Assumptions

There is a helpful LINE mnemonic used for the Regression Assumptions:

The independence assumption is held because… Others can be checked by looking at the residuals of the models.

par(mfrow = c(2, 2))
plot(model1)

а потом еще две модели)))0 ** но я не уверена, что надо рассматривать все!!! мб только одной ограничиться? хотя хз, короче**

par(mfrow = c(2, 2))
plot(model2)

выводы 2

par(mfrow = c(2, 2))
plot(model3)

выводы 3

Second regression

Здесь шота про бэкграунд

#data <- read_sav("ESS5e03.sav",  to.data.frame = TRUE)
data = ESS5e03
savevars <- c("eduyrs", "agea", "agertr", "gndr", 
              "cntry", "icmnact", "hinctnta")
ESS1 <- data[savevars]
rm(data, savevars)

Describing variables

ESS2 = ESS1 %>% 
  select(eduyrs, agea, agertr, gndr, cntry, icmnact, hinctnta)

Смотрим на датасет

str(ESS2)
## Classes 'tbl_df', 'tbl' and 'data.frame':    52458 obs. of  7 variables:
##  $ eduyrs  : 'haven_labelled' num  15 15 13 15 15 13 17 13 12 12 ...
##   ..- attr(*, "label")= chr "Years of full-time education completed"
##   ..- attr(*, "format.spss")= chr "F2.0"
##   ..- attr(*, "labels")= Named num  77 88 99
##   .. ..- attr(*, "names")= chr  "Refusal" "Don't know" "No answer"
##  $ agea    : 'haven_labelled' num  22 43 19 23 58 62 26 40 48 68 ...
##   ..- attr(*, "label")= chr "Age of respondent, calculated"
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "labels")= Named num 999
##   .. ..- attr(*, "names")= chr "Not available"
##  $ agertr  : 'haven_labelled' num  NA NA NA NA 60 60 NA NA 58 61 ...
##   ..- attr(*, "label")= chr "What age you would like to/would have liked to retire"
##   ..- attr(*, "format.spss")= chr "F3.0"
##   ..- attr(*, "labels")= Named num  666 777 888 999
##   .. ..- attr(*, "names")= chr  "Not applicable" "Refusal" "Don't know" "No answer"
##  $ gndr    : 'haven_labelled' num  1 1 2 2 2 2 1 1 1 1 ...
##   ..- attr(*, "label")= chr "Gender"
##   ..- attr(*, "format.spss")= chr "F1.0"
##   ..- attr(*, "display_width")= int 6
##   ..- attr(*, "labels")= Named num  1 2 9
##   .. ..- attr(*, "names")= chr  "Male" "Female" "No answer"
##  $ cntry   : 'haven_labelled' chr  "BE" "BE" "BE" "BE" ...
##   ..- attr(*, "label")= chr "Country"
##   ..- attr(*, "format.spss")= chr "A2"
##   ..- attr(*, "display_width")= int 7
##   ..- attr(*, "labels")= Named chr  "UA" "GB" "BE" "DE" ...
##   .. ..- attr(*, "names")= chr  "Ukraine" "United Kingdom" "Belgium" "Germany" ...
##  $ icmnact : 'haven_labelled' num  3 1 3 1 3 2 1 1 1 2 ...
##   ..- attr(*, "label")= chr "Interviewer code, main activity of respondent"
##   ..- attr(*, "format.spss")= chr "F1.0"
##   ..- attr(*, "display_width")= int 9
##   ..- attr(*, "labels")= Named num  1 2 3 9
##   .. ..- attr(*, "names")= chr  "In paid work" "Retired" "All others" "Not available"
##  $ hinctnta: 'haven_labelled' num  NA 5 1 10 6 5 6 2 6 8 ...
##   ..- attr(*, "label")= chr "Household's total net income, all sources"
##   ..- attr(*, "format.spss")= chr "F2.0"
##   ..- attr(*, "display_width")= int 10
##   ..- attr(*, "labels")= Named num  1 2 3 4 5 6 7 8 9 10 ...
##   .. ..- attr(*, "names")= chr  "J" "R" "C" "M" ...
summary(ESS2)
##      eduyrs           agea            agertr            gndr      
##  Min.   : 0.00   Min.   : 14.00   Min.   :  0.00   Min.   :1.000  
##  1st Qu.:10.00   1st Qu.: 33.00   1st Qu.: 56.00   1st Qu.:1.000  
##  Median :12.00   Median : 48.00   Median : 60.00   Median :2.000  
##  Mean   :12.29   Mean   : 48.51   Mean   : 60.04   Mean   :1.546  
##  3rd Qu.:15.00   3rd Qu.: 63.00   3rd Qu.: 64.00   3rd Qu.:2.000  
##  Max.   :55.00   Max.   :102.00   Max.   :120.00   Max.   :2.000  
##  NA's   :629     NA's   :153      NA's   :27902    NA's   :21     
##     cntry                   icmnact         hinctnta     
##  Length:52458            Min.   :1.000   Min.   : 1.000  
##  Class :haven_labelled   1st Qu.:1.000   1st Qu.: 3.000  
##  Mode  :character        Median :2.000   Median : 5.000  
##                          Mean   :1.812   Mean   : 5.049  
##                          3rd Qu.:3.000   3rd Qu.: 7.000  
##                          Max.   :3.000   Max.   :10.000  
##                          NA's   :134     NA's   :12620
ESS2$eduyrs = as.numeric(as.character(ESS2$eduyrs))
ESS2$agertr = as.numeric(as.character(ESS2$agertr))
ESS2$agea = as.numeric(as.character(ESS2$agea))
ESS2$hinctnta<-as.numeric(ESS1$hinctnta)

А здесь табличка!


Graphs for all variables

В графике ниже мы смотрим сразу на всё:

my_fn1 <- function(data, mapping, ...){
  p1 <- ggplot(data = ESS2, mapping = mapping) + 
    geom_point() + 
    geom_smooth(method=lm, fill="blue", color="blue", ...)
  p1
}
library(GGally)
g1 = ggpairs(ESS2,columns = c(1,2,3,7), lower = list(continuous = my_fn1), title= "1")
g1


Multiple Linear Regression

If X1 and X2 interact, this means that the effect of X1 on Y depends on the value of X2, and vice verca (с) скоммунизджено

ESS4 <- na.omit(subset(ESS2, icmnact==1))
m1 <- lm(agertr ~ agea, data = ESS4)
summary(m1)
## 
## Call:
## lm(formula = agertr ~ agea, data = ESS4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.250  -2.506   0.195   2.505  58.728 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 47.33278    0.56274   84.11   <2e-16 ***
## agea         0.24455    0.01035   23.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.169 on 7425 degrees of freedom
## Multiple R-squared:  0.06996,    Adjusted R-squared:  0.06983 
## F-statistic: 558.5 on 1 and 7425 DF,  p-value: < 2.2e-16
m2 <- lm(agertr ~ agea + gndr, data = ESS4)
summary(m2)
## 
## Call:
## lm(formula = agertr ~ agea + gndr, data = ESS4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.962  -2.723  -0.002   2.384  57.993 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 49.92842    0.59212   84.32   <2e-16 ***
## agea         0.23869    0.01025   23.30   <2e-16 ***
## gndr        -1.52691    0.11879  -12.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.113 on 7424 degrees of freedom
## Multiple R-squared:  0.09021,    Adjusted R-squared:  0.08996 
## F-statistic:   368 on 2 and 7424 DF,  p-value: < 2.2e-16
anova(m1, m2)
## Analysis of Variance Table
## 
## Model 1: agertr ~ agea
## Model 2: agertr ~ agea + gndr
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7425 198412                                  
## 2   7424 194093  1    4319.4 165.21 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m3 <- lm(agertr ~ agea + gndr + hinctnta, data = ESS4)
summary(m3)
## 
## Call:
## lm(formula = agertr ~ agea + gndr + hinctnta, data = ESS4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.321  -2.667   0.033   2.424  57.785 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 48.71478    0.62708  77.685  < 2e-16 ***
## agea         0.24357    0.01026  23.745  < 2e-16 ***
## gndr        -1.44434    0.11939 -12.097  < 2e-16 ***
## hinctnta     0.13259    0.02295   5.776 7.95e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.102 on 7423 degrees of freedom
## Multiple R-squared:  0.09428,    Adjusted R-squared:  0.09391 
## F-statistic: 257.6 on 3 and 7423 DF,  p-value: < 2.2e-16
anova(m2, m3)
## Analysis of Variance Table
## 
## Model 1: agertr ~ agea + gndr
## Model 2: agertr ~ agea + gndr + hinctnta
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7424 194093                                  
## 2   7423 193224  1     868.5 33.365 7.949e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m4 <- lm(agertr ~ agea + gndr + hinctnta + eduyrs, data = ESS4)
summary(m4)
## 
## Call:
## lm(formula = agertr ~ agea + gndr + hinctnta + eduyrs, data = ESS4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.964  -2.634   0.073   2.459  57.756 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 47.26782    0.65084  72.626  < 2e-16 ***
## agea         0.24686    0.01022  24.145  < 2e-16 ***
## gndr        -1.52126    0.11930 -12.751  < 2e-16 ***
## hinctnta     0.07234    0.02410   3.002   0.0027 ** 
## eduyrs       0.13201    0.01672   7.896 3.31e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.081 on 7422 degrees of freedom
## Multiple R-squared:  0.1018, Adjusted R-squared:  0.1013 
## F-statistic: 210.3 on 4 and 7422 DF,  p-value: < 2.2e-16
anova(m3, m4)
## Analysis of Variance Table
## 
## Model 1: agertr ~ agea + gndr + hinctnta
## Model 2: agertr ~ agea + gndr + hinctnta + eduyrs
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7423 193224                                  
## 2   7422 191615  1    1609.4 62.339 3.307e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Checking Linear Regression Assumptions

Я предлагаю посмотреть только на самую интересную модель с кучей предикторов, чтобы типа еще раз попрактироваться.

par(mfrow = c(2, 2))
plot(m4)