title: ‘22041’ output: html_document: default —

library(dplyr)
library(ggplot2)
library(stringr)
library(readr)
library(sjlabelled)
library(sjmisc) 
library(sjstats) 
library(ggeffects)  
library(sjPlot)
library(knitr)
library(psych)
library(magrittr)
library(kableExtra)
library(tidytext)
library(RColorBrewer)
library(haven)

df <- read_csv("D:/Documents/data1_1.csv")
ESS5e03 <- read_sav("D:/Documents/ESS5e03.sav")

# Read the literature and jutify what you do and want to expect
# Draw scatterplots to make sure further analysis is necessary
# Estimate the regression model
# Test the overall quality of the regression model
# Interpret regression coefficients (point estimates andconfidence intervals)
# Compare the strength of association with standardized coefficients
# Predict the outcome

df$books = as.numeric(as.character(df$books))
df$attend = as.numeric(as.character(df$attend))
df$grade = as.numeric(as.character(df$grade))

Have a look at the data we have

str(df)

## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 40 obs. of  3 variables:
##  $ books : num  0 1 0 2 4 4 1 4 3 0 ...
##  $ attend: num  9 15 10 16 10 20 11 20 15 15 ...
##  $ grade : num  45 57 45 51 65 88 44 87 89 59 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   books = col_double(),
##   ..   attend = col_double(),
##   ..   grade = col_double()
##   .. )

summary(df)

##      books       attend          grade      
##  Min.   :0   Min.   : 6.00   Min.   :37.00  
##  1st Qu.:1   1st Qu.:10.00   1st Qu.:51.00  
##  Median :2   Median :15.00   Median :60.50  
##  Mean   :2   Mean   :14.10   Mean   :63.55  
##  3rd Qu.:3   3rd Qu.:18.25   3rd Qu.:73.00  
##  Max.   :4   Max.   :20.00   Max.   :97.00

Using BoxPlot To Check For Outliers

par(mfrow=c(1, 3))
boxplot(df$books, main="books", sub=paste("Outlier rows: ", boxplot.stats(df$books)$out))
boxplot(df$attend, main="attend", sub=paste("Outlier rows: ", boxplot.stats(df$attend)$out))
boxplot(df$grade, main="grade", sub=paste("Outlier rows: ", boxplot.stats(df$grade)$out))

Using Density Plots To Check If Variable Is Close To Normal

library(cowplot)

## 
## Attaching package: 'cowplot'

## The following objects are masked from 'package:sjPlot':
## 
##     plot_grid, save_plot

## The following object is masked from 'package:ggplot2':
## 
##     ggsave

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

bo = ggplot(data = df, aes(x = books))  + geom_density(col = "pink", fill = "pink", alpha = 0.5)
at = ggplot(data = df, aes(x = attend)) + geom_density(col = "blue", fill = "blue", alpha = 0.5)
gr = ggplot(data = df, aes(x = grade)) + geom_density(col = "yellow", fill = "yellow", alpha = 0.5)

plot_grid(bo, at, gr)

Using Scatter Plot To Visualise The Relationship

sc1 = ggplot(data = df, aes(x = books, y = grade))  + geom_point() + geom_smooth(method = lm, fill="blue", color="blue", se = FALSE)
sc2 = ggplot(data = df, aes(x = attend, y = grade)) + geom_point() + geom_smooth(method = lm, fill="blue", color="blue", se = FALSE)
sc3 = ggplot(data = df, aes(x = attend, y = books)) + geom_point() + geom_smooth(method=lm, fill="blue", color="blue", se = FALSE)

plot_grid(sc1, sc2, sc3)

Look at correlation coefficients

q = cor(df)
sjp.corr(df, show.legend = TRUE)

## Computing correlation using pearson-method with listwise-deletion...

## Warning: Removed 6 rows containing missing values (geom_text).

Делаем модели

model1 = lm(grade ~ books, data = df)
model2 = lm(grade ~ attend, data = df)
model3 = lm(grade ~ books + attend, data = df)

Смотрим саммери
Сначала смотрим на F-Ratio, проверяем, что P-value (PR(>F)) is significant
Потом смотрим на Adjusted R squared - the share of variance of outcome variable that explains the model: from 0 to 1. It shows how good the model is.
Дальше смотрим на Estimate: intercept здесь определение и slope здесь тоже определение.

library(snakecase)
sjPlot::tab_model(model1)

	grade
Predictors	Estimates	CI	p
(Intercept)	52.07	44.17 – 59.98	<0.001
books	5.74	2.51 – 8.97	0.001
Observations	40
R² / adjusted R²	0.242 / 0.222

sjPlot::tab_model(model2)

	grade
Predictors	Estimates	CI	p
(Intercept)	37.00	20.99 – 53.01	<0.001
attend	1.88	0.80 – 2.97	0.002
Observations	40
R² / adjusted R²	0.233 / 0.212

sjPlot::tab_model(model3)

	grade
Predictors	Estimates	CI	p
(Intercept)	37.38	22.20 – 52.56	<0.001
books	4.04	0.60 – 7.47	0.027
attend	1.28	0.13 – 2.43	0.035
Observations	40
R² / adjusted R²	0.329 / 0.292

# the algorithm for summary2
#F-statistics to look at P-value (PR(>F)) - if significant, the model is OK
#Adjusted R squared: to look how good the model is: it is : how many percent the first variable explains the second 
#Estimate: the intercept if the variable if equal to 0. In the case of grade and attend(2), if we do not atend classes we receive 37. If we attend 2 classes, it will be 37 + 2*1.883

Comparing Models

#Compare models. They should be Nested models. 
anova(model1, model3)

## Analysis of Variance Table
## 
## Model 1: grade ~ books
## Model 2: grade ~ books + attend
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1     38 8250.4                              
## 2     37 7306.2  1    944.16 4.7814 0.03517 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(model2, model3)

## Analysis of Variance Table
## 
## Model 1: grade ~ attend
## Model 2: grade ~ books + attend
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1     38 8353.4                              
## 2     37 7306.2  1    1047.1 5.3028 0.02701 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Checking Linear Regression Assumptions

There is a helpful LINE mnemonic used for the Regression Assumptions:

L- linearity- the Y-values can be expressed as a linear function of X-variable
I- independence - the errors are independent
N- normality - values are normally distributed
E- equal variance (homoscedasticity) - variation of observations around the regression line is constant

The independence assumption is held because… Others can be checked by looking at the residuals of the models.

par(mfrow = c(2, 2))
plot(model1)

For linearity assumption: look at Residuals vs.Fitted plot. There are predicted Y-values displayed on x-axis and Residuals (or errors) on y-axis. If the assumption is met, the red line should be fairly flat (перефразируй) . Выводы по нашей картинке
For normality assumption: look at Q-Q plot. If residuals (or Y-values) are normally distributed, the points should fall roughly on the diagonal line ** перефраз. Выводы**
For homoscedasticity assumption: look at Scale-Location & Residuals vs. Leverage plots to identify non-linearities, non-constant variance and troublesome observations. Этого я не поняла, надо загуглить и уточнить

а потом еще две модели)))0 ** но я не уверена, что надо рассматривать все!!! мб только одной ограничиться? хотя хз, короче**

par(mfrow = c(2, 2))
plot(model2)

выводы 2

par(mfrow = c(2, 2))
plot(model3)

выводы 3

Second regression

Здесь шота про бэкграунд

#data <- read_sav("ESS5e03.sav",  to.data.frame = TRUE)
data = ESS5e03
savevars <- c("eduyrs", "agea", "agertr", "gndr", 
              "cntry", "icmnact", "hinctnta")
ESS1 <- data[savevars]
rm(data, savevars)

Describing variables

ESS2 = ESS1 %>% 
  select(eduyrs, agea, agertr, gndr, cntry, icmnact, hinctnta)

Смотрим на датасет

str(ESS2)

## Classes 'tbl_df', 'tbl' and 'data.frame':    52458 obs. of  7 variables:
##  $ eduyrs  : 'haven_labelled' num  15 15 13 15 15 13 17 13 12 12 ...
##   ..- attr(*, "label")= chr "Years of full-time education completed"
##   ..- attr(*, "format.spss")= chr "F2.0"
##   ..- attr(*, "labels")= Named num  77 88 99
##   .. ..- attr(*, "names")= chr  "Refusal" "Don't know" "No answer"
##  $ agea    : 'haven_labelled' num  22 43 19 23 58 62 26 40 48 68 ...
##   ..- attr(*, "label")= chr "Age of respondent, calculated"
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "labels")= Named num 999
##   .. ..- attr(*, "names")= chr "Not available"
##  $ agertr  : 'haven_labelled' num  NA NA NA NA 60 60 NA NA 58 61 ...
##   ..- attr(*, "label")= chr "What age you would like to/would have liked to retire"
##   ..- attr(*, "format.spss")= chr "F3.0"
##   ..- attr(*, "labels")= Named num  666 777 888 999
##   .. ..- attr(*, "names")= chr  "Not applicable" "Refusal" "Don't know" "No answer"
##  $ gndr    : 'haven_labelled' num  1 1 2 2 2 2 1 1 1 1 ...
##   ..- attr(*, "label")= chr "Gender"
##   ..- attr(*, "format.spss")= chr "F1.0"
##   ..- attr(*, "display_width")= int 6
##   ..- attr(*, "labels")= Named num  1 2 9
##   .. ..- attr(*, "names")= chr  "Male" "Female" "No answer"
##  $ cntry   : 'haven_labelled' chr  "BE" "BE" "BE" "BE" ...
##   ..- attr(*, "label")= chr "Country"
##   ..- attr(*, "format.spss")= chr "A2"
##   ..- attr(*, "display_width")= int 7
##   ..- attr(*, "labels")= Named chr  "UA" "GB" "BE" "DE" ...
##   .. ..- attr(*, "names")= chr  "Ukraine" "United Kingdom" "Belgium" "Germany" ...
##  $ icmnact : 'haven_labelled' num  3 1 3 1 3 2 1 1 1 2 ...
##   ..- attr(*, "label")= chr "Interviewer code, main activity of respondent"
##   ..- attr(*, "format.spss")= chr "F1.0"
##   ..- attr(*, "display_width")= int 9
##   ..- attr(*, "labels")= Named num  1 2 3 9
##   .. ..- attr(*, "names")= chr  "In paid work" "Retired" "All others" "Not available"
##  $ hinctnta: 'haven_labelled' num  NA 5 1 10 6 5 6 2 6 8 ...
##   ..- attr(*, "label")= chr "Household's total net income, all sources"
##   ..- attr(*, "format.spss")= chr "F2.0"
##   ..- attr(*, "display_width")= int 10
##   ..- attr(*, "labels")= Named num  1 2 3 4 5 6 7 8 9 10 ...
##   .. ..- attr(*, "names")= chr  "J" "R" "C" "M" ...

summary(ESS2)

##      eduyrs           agea            agertr            gndr      
##  Min.   : 0.00   Min.   : 14.00   Min.   :  0.00   Min.   :1.000  
##  1st Qu.:10.00   1st Qu.: 33.00   1st Qu.: 56.00   1st Qu.:1.000  
##  Median :12.00   Median : 48.00   Median : 60.00   Median :2.000  
##  Mean   :12.29   Mean   : 48.51   Mean   : 60.04   Mean   :1.546  
##  3rd Qu.:15.00   3rd Qu.: 63.00   3rd Qu.: 64.00   3rd Qu.:2.000  
##  Max.   :55.00   Max.   :102.00   Max.   :120.00   Max.   :2.000  
##  NA's   :629     NA's   :153      NA's   :27902    NA's   :21     
##     cntry                   icmnact         hinctnta     
##  Length:52458            Min.   :1.000   Min.   : 1.000  
##  Class :haven_labelled   1st Qu.:1.000   1st Qu.: 3.000  
##  Mode  :character        Median :2.000   Median : 5.000  
##                          Mean   :1.812   Mean   : 5.049  
##                          3rd Qu.:3.000   3rd Qu.: 7.000  
##                          Max.   :3.000   Max.   :10.000  
##                          NA's   :134     NA's   :12620

ESS2$eduyrs = as.numeric(as.character(ESS2$eduyrs))
ESS2$agertr = as.numeric(as.character(ESS2$agertr))
ESS2$agea = as.numeric(as.character(ESS2$agea))
ESS2$hinctnta<-as.numeric(ESS1$hinctnta)

А здесь табличка!

Graphs for all variables

В графике ниже мы смотрим сразу на всё:

как распределены количественные переменные: в 1 и 3 случае это вроде называется univariate distribution (датаразбита по пяти столбикам, все одинаковой длины) И вот тут описание наших недогистограмм
коэффициент корреляции
поскольку корреляция кажется хуевенькой, мы также сразу можем посмотреть на числовые значения. И вот тут сделать выводы по ним
Это же просто гениально, можно не строить, мать его, 10 графиков!!!!!!!!!!!!*

my_fn1 <- function(data, mapping, ...){
  p1 <- ggplot(data = ESS2, mapping = mapping) + 
    geom_point() + 
    geom_smooth(method=lm, fill="blue", color="blue", ...)
  p1
}
library(GGally)
g1 = ggpairs(ESS2,columns = c(1,2,3,7), lower = list(continuous = my_fn1), title= "1")
g1

Multiple Linear Regression

If X1 and X2 interact, this means that the effect of X1 on Y depends on the value of X2, and vice verca (с) скоммунизджено

ESS4 <- na.omit(subset(ESS2, icmnact==1))

m1 <- lm(agertr ~ agea, data = ESS4)
summary(m1)

## 
## Call:
## lm(formula = agertr ~ agea, data = ESS4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.250  -2.506   0.195   2.505  58.728 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 47.33278    0.56274   84.11   <2e-16 ***
## agea         0.24455    0.01035   23.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.169 on 7425 degrees of freedom
## Multiple R-squared:  0.06996,    Adjusted R-squared:  0.06983 
## F-statistic: 558.5 on 1 and 7425 DF,  p-value: < 2.2e-16

m2 <- lm(agertr ~ agea + gndr, data = ESS4)
summary(m2)

## 
## Call:
## lm(formula = agertr ~ agea + gndr, data = ESS4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.962  -2.723  -0.002   2.384  57.993 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 49.92842    0.59212   84.32   <2e-16 ***
## agea         0.23869    0.01025   23.30   <2e-16 ***
## gndr        -1.52691    0.11879  -12.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.113 on 7424 degrees of freedom
## Multiple R-squared:  0.09021,    Adjusted R-squared:  0.08996 
## F-statistic:   368 on 2 and 7424 DF,  p-value: < 2.2e-16

anova(m1, m2)

## Analysis of Variance Table
## 
## Model 1: agertr ~ agea
## Model 2: agertr ~ agea + gndr
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7425 198412                                  
## 2   7424 194093  1    4319.4 165.21 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

m3 <- lm(agertr ~ agea + gndr + hinctnta, data = ESS4)
summary(m3)

## 
## Call:
## lm(formula = agertr ~ agea + gndr + hinctnta, data = ESS4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.321  -2.667   0.033   2.424  57.785 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 48.71478    0.62708  77.685  < 2e-16 ***
## agea         0.24357    0.01026  23.745  < 2e-16 ***
## gndr        -1.44434    0.11939 -12.097  < 2e-16 ***
## hinctnta     0.13259    0.02295   5.776 7.95e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.102 on 7423 degrees of freedom
## Multiple R-squared:  0.09428,    Adjusted R-squared:  0.09391 
## F-statistic: 257.6 on 3 and 7423 DF,  p-value: < 2.2e-16

anova(m2, m3)

## Analysis of Variance Table
## 
## Model 1: agertr ~ agea + gndr
## Model 2: agertr ~ agea + gndr + hinctnta
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7424 194093                                  
## 2   7423 193224  1     868.5 33.365 7.949e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

m4 <- lm(agertr ~ agea + gndr + hinctnta + eduyrs, data = ESS4)
summary(m4)

## 
## Call:
## lm(formula = agertr ~ agea + gndr + hinctnta + eduyrs, data = ESS4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.964  -2.634   0.073   2.459  57.756 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 47.26782    0.65084  72.626  < 2e-16 ***
## agea         0.24686    0.01022  24.145  < 2e-16 ***
## gndr        -1.52126    0.11930 -12.751  < 2e-16 ***
## hinctnta     0.07234    0.02410   3.002   0.0027 ** 
## eduyrs       0.13201    0.01672   7.896 3.31e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.081 on 7422 degrees of freedom
## Multiple R-squared:  0.1018, Adjusted R-squared:  0.1013 
## F-statistic: 210.3 on 4 and 7422 DF,  p-value: < 2.2e-16

anova(m3, m4)

## Analysis of Variance Table
## 
## Model 1: agertr ~ agea + gndr + hinctnta
## Model 2: agertr ~ agea + gndr + hinctnta + eduyrs
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7423 193224                                  
## 2   7422 191615  1    1609.4 62.339 3.307e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Checking Linear Regression Assumptions

Я предлагаю посмотреть только на самую интересную модель с кучей предикторов, чтобы типа еще раз попрактироваться.

par(mfrow = c(2, 2))
plot(m4)