title: ‘22041’ output: html_document: default —

library(dplyr)
library(ggplot2)
library(stringr)
library(readr)
library(sjlabelled)
library(sjmisc) 
library(sjstats) 
library(ggeffects)  
library(sjPlot)
library(knitr)
library(psych)
library(magrittr)
library(kableExtra)
library(tidytext)
library(RColorBrewer)
library(haven)

df <- read_csv("D:/Documents/data1_1.csv")
ESS5e03 <- read_sav("D:/Documents/ESS5e03.sav")

# Read the literature and jutify what you do and want to expect
# Draw scatterplots to make sure further analysis is necessary
# Estimate the regression model
# Test the overall quality of the regression model
# Interpret regression coefficients (point estimates andconfidence intervals)
# Compare the strength of association with standardized coefficients
# Predict the outcome

df$books = as.numeric(as.character(df$books))
df$attend = as.numeric(as.character(df$attend))
df$grade = as.numeric(as.character(df$grade))

Have a look at the data we have

head(df)

## # A tibble: 6 x 3
##   books attend grade
##   <dbl>  <dbl> <dbl>
## 1     0      9    45
## 2     1     15    57
## 3     0     10    45
## 4     2     16    51
## 5     4     10    65
## 6     4     20    88

str(df)

## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 40 obs. of  3 variables:
##  $ books : num  0 1 0 2 4 4 1 4 3 0 ...
##  $ attend: num  9 15 10 16 10 20 11 20 15 15 ...
##  $ grade : num  45 57 45 51 65 88 44 87 89 59 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   books = col_double(),
##   ..   attend = col_double(),
##   ..   grade = col_double()
##   .. )

summary(df)

##      books       attend          grade      
##  Min.   :0   Min.   : 6.00   Min.   :37.00  
##  1st Qu.:1   1st Qu.:10.00   1st Qu.:51.00  
##  Median :2   Median :15.00   Median :60.50  
##  Mean   :2   Mean   :14.10   Mean   :63.55  
##  3rd Qu.:3   3rd Qu.:18.25   3rd Qu.:73.00  
##  Max.   :4   Max.   :20.00   Max.   :97.00

Using BoxPlot To Check For Outliers

par(mfrow=c(1, 3))
boxplot(df$books, main="books", sub=paste("Outlier rows: ", boxplot.stats(df$books)$out))
boxplot(df$attend, main="attend", sub=paste("Outlier rows: ", boxplot.stats(df$attend)$out))
boxplot(df$grade, main="grade", sub=paste("Outlier rows: ", boxplot.stats(df$grade)$out))

Using Scatter Plot To Visualise The Relationship

Using Density Plot To Check If Variable Is Close To Normal

В графике ниже мы смотрим сразу на всё:

как распределены количественные переменные: в 1 и 3 случае это вроде называется univariate distribution (датаразбита по пяти столбикам, все одинаковой длины) И вот тут описание наших недогистограмм
коэффициент корреляции
поскольку корреляция кажется хуевенькой, мы также сразу можем посмотреть на числовые значения. И вот тут сделать выводы по ним
Это же просто гениально, можно не строить, мать его, шесть графиков!!!!!!!!!!!!*

my_fn <- function(data, mapping, ...){
  p <- ggplot(data = df, mapping = mapping) + 
    geom_point() + 
    geom_smooth(method=lm, fill="blue", color="blue", ...)
  p
}

library(GGally)

## 
## Attaching package: 'GGally'

## The following object is masked from 'package:dplyr':
## 
##     nasa

g = ggpairs(df,columns = 1:3, lower = list(continuous = my_fn), title= "1" )
g

Делаем модели

model1 = lm(grade ~ books, data = df)
model2 = lm(grade ~ attend, data = df)
model3 = lm(grade ~ books + attend, data = df)

Смотрим саммери
Сначала смотрим на F-Ratio, проверяем, что P-value (PR(>F)) is significant
Потом смотрим на Adjusted R squared - the share of variance of outcome variable that explains the model: from 0 to 1. It shows how good the model is.
Дальше смотрим на Estimate: intercept здесь определение и slope здесь тоже определение.

summary(model1)

## 
## Call:
## lm(formula = grade ~ books, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.025 -12.616  -1.181  10.425  33.450 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   52.075      4.035  12.905 1.83e-15 ***
## books          5.738      1.647   3.483  0.00127 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.73 on 38 degrees of freedom
## Multiple R-squared:  0.242,  Adjusted R-squared:  0.222 
## F-statistic: 12.13 on 1 and 38 DF,  p-value: 0.001265

summary(model2)

## 
## Call:
## lm(formula = grade ~ attend, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.777 -10.904   2.021  12.430  31.755 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   36.998      8.169   4.529 5.71e-05 ***
## attend         1.883      0.555   3.393  0.00163 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.83 on 38 degrees of freedom
## Multiple R-squared:  0.2325, Adjusted R-squared:  0.2123 
## F-statistic: 11.51 on 1 and 38 DF,  p-value: 0.001629

summary(model3)

## 
## Call:
## lm(formula = grade ~ books + attend, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.802 -13.374   0.060   9.173  32.295 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   37.379      7.745   4.827 2.41e-05 ***
## books          4.037      1.753   2.303   0.0270 *  
## attend         1.284      0.587   2.187   0.0352 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.05 on 37 degrees of freedom
## Multiple R-squared:  0.3287, Adjusted R-squared:  0.2924 
## F-statistic: 9.059 on 2 and 37 DF,  p-value: 0.0006278

# the algorithm for summary2
#F-statistics to look at P-value (PR(>F)) - if significant, the model is OK
#Adjusted R squared: to look how good the model is: it is : how many percent the first variable explains the second 
#Estimate: the intercept if the variable if equal to 0. In the case of grade and attend(2), if we do not atend classes we receive 37. If we attend 2 classes, it will be 37 + 2*1.883

Comparing Models

#Compare models. They should be Nested models. 
anova(model1, model3)

## Analysis of Variance Table
## 
## Model 1: grade ~ books
## Model 2: grade ~ books + attend
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1     38 8250.4                              
## 2     37 7306.2  1    944.16 4.7814 0.03517 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova(model2, model3)

## Analysis of Variance Table
## 
## Model 1: grade ~ attend
## Model 2: grade ~ books + attend
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1     38 8353.4                              
## 2     37 7306.2  1    1047.1 5.3028 0.02701 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Checking Linear Regression Assumptions

There is a helpful LINE mnemonic used for the Regression Assumptions:

L- linearity- the Y-values can be expressed as a linear function of X-variable
I- independence - the errors are independent
N- normality - values are normally distributed
E- equal variance (homoscedasticity) - variation of observations around the regression line is constant

The independence assumption is held because… Others can be checked by looking at the residuals of the models.

par(mfrow = c(2, 2))
plot(model1)

For linearity assumption: look at Residuals vs.Fitted plot. There are predicted Y-values displayed on x-axis and Residuals (or errors) on y-axis. If the assumption is met, the red line should be fairly flat (перефразируй) . Выводы по нашей картинке
For normality assumption: look at Q-Q plot. If residuals (or Y-values) are normally distributed, the points should fall roughly on the diagonal line ** перефраз. Выводы**
For homoscedasticity assumption: look at Scale-Location & Residuals vs. Leverage plots to identify non-linearities, non-constant variance and troublesome observations. Этого я не поняла, надо загуглить и уточнить

а потом еще две модели)))0

par(mfrow = c(2, 2))
plot(model2)

выводы 2

par(mfrow = c(2, 2))
plot(model3)

выводы 3

Second regression

Здесь шота про бэкграунд

#data <- read_sav("ESS5e03.sav",  to.data.frame = TRUE)
data = ESS5e03
savevars <- c("eduyrs", "agea", "agertr", "gndr", 
              "cntry", "icmnact", "hinctnta")
ESS1 <- data[savevars]
rm(data, savevars)

Describing variables

ESS2 = ESS1 %>% 
  select(eduyrs, agea, agertr, gndr, cntry, icmnact, hinctnta)

Смотрим на датасет

head(ESS2)

## # A tibble: 6 x 7
##      eduyrs     agea    agertr       gndr cntry            icmnact hinctnta
##   <dbl+lbl> <dbl+lb> <dbl+lbl>  <dbl+lbl> <chr+lbl>      <dbl+lbl> <dbl+lb>
## 1        15       22        NA 1 [Male]   BE [Belgi~ 3 [All other~   NA    
## 2        15       43        NA 1 [Male]   BE [Belgi~ 1 [In paid w~    5 [F]
## 3        13       19        NA 2 [Female] BE [Belgi~ 3 [All other~    1 [J]
## 4        15       23        NA 2 [Female] BE [Belgi~ 1 [In paid w~   10 [H]
## 5        15       58        60 2 [Female] BE [Belgi~ 3 [All other~    6 [S]
## 6        13       62        60 2 [Female] BE [Belgi~ 2 [Retired]      5 [F]

str(ESS2)

## Classes 'tbl_df', 'tbl' and 'data.frame':    52458 obs. of  7 variables:
##  $ eduyrs  : 'haven_labelled' num  15 15 13 15 15 13 17 13 12 12 ...
##   ..- attr(*, "label")= chr "Years of full-time education completed"
##   ..- attr(*, "format.spss")= chr "F2.0"
##   ..- attr(*, "labels")= Named num  77 88 99
##   .. ..- attr(*, "names")= chr  "Refusal" "Don't know" "No answer"
##  $ agea    : 'haven_labelled' num  22 43 19 23 58 62 26 40 48 68 ...
##   ..- attr(*, "label")= chr "Age of respondent, calculated"
##   ..- attr(*, "format.spss")= chr "F4.0"
##   ..- attr(*, "labels")= Named num 999
##   .. ..- attr(*, "names")= chr "Not available"
##  $ agertr  : 'haven_labelled' num  NA NA NA NA 60 60 NA NA 58 61 ...
##   ..- attr(*, "label")= chr "What age you would like to/would have liked to retire"
##   ..- attr(*, "format.spss")= chr "F3.0"
##   ..- attr(*, "labels")= Named num  666 777 888 999
##   .. ..- attr(*, "names")= chr  "Not applicable" "Refusal" "Don't know" "No answer"
##  $ gndr    : 'haven_labelled' num  1 1 2 2 2 2 1 1 1 1 ...
##   ..- attr(*, "label")= chr "Gender"
##   ..- attr(*, "format.spss")= chr "F1.0"
##   ..- attr(*, "display_width")= int 6
##   ..- attr(*, "labels")= Named num  1 2 9
##   .. ..- attr(*, "names")= chr  "Male" "Female" "No answer"
##  $ cntry   : 'haven_labelled' chr  "BE" "BE" "BE" "BE" ...
##   ..- attr(*, "label")= chr "Country"
##   ..- attr(*, "format.spss")= chr "A2"
##   ..- attr(*, "display_width")= int 7
##   ..- attr(*, "labels")= Named chr  "UA" "GB" "BE" "DE" ...
##   .. ..- attr(*, "names")= chr  "Ukraine" "United Kingdom" "Belgium" "Germany" ...
##  $ icmnact : 'haven_labelled' num  3 1 3 1 3 2 1 1 1 2 ...
##   ..- attr(*, "label")= chr "Interviewer code, main activity of respondent"
##   ..- attr(*, "format.spss")= chr "F1.0"
##   ..- attr(*, "display_width")= int 9
##   ..- attr(*, "labels")= Named num  1 2 3 9
##   .. ..- attr(*, "names")= chr  "In paid work" "Retired" "All others" "Not available"
##  $ hinctnta: 'haven_labelled' num  NA 5 1 10 6 5 6 2 6 8 ...
##   ..- attr(*, "label")= chr "Household's total net income, all sources"
##   ..- attr(*, "format.spss")= chr "F2.0"
##   ..- attr(*, "display_width")= int 10
##   ..- attr(*, "labels")= Named num  1 2 3 4 5 6 7 8 9 10 ...
##   .. ..- attr(*, "names")= chr  "J" "R" "C" "M" ...

summary(ESS2)

##      eduyrs           agea            agertr            gndr      
##  Min.   : 0.00   Min.   : 14.00   Min.   :  0.00   Min.   :1.000  
##  1st Qu.:10.00   1st Qu.: 33.00   1st Qu.: 56.00   1st Qu.:1.000  
##  Median :12.00   Median : 48.00   Median : 60.00   Median :2.000  
##  Mean   :12.29   Mean   : 48.51   Mean   : 60.04   Mean   :1.546  
##  3rd Qu.:15.00   3rd Qu.: 63.00   3rd Qu.: 64.00   3rd Qu.:2.000  
##  Max.   :55.00   Max.   :102.00   Max.   :120.00   Max.   :2.000  
##  NA's   :629     NA's   :153      NA's   :27902    NA's   :21     
##     cntry                   icmnact         hinctnta     
##  Length:52458            Min.   :1.000   Min.   : 1.000  
##  Class :haven_labelled   1st Qu.:1.000   1st Qu.: 3.000  
##  Mode  :character        Median :2.000   Median : 5.000  
##                          Mean   :1.812   Mean   : 5.049  
##                          3rd Qu.:3.000   3rd Qu.: 7.000  
##                          Max.   :3.000   Max.   :10.000  
##                          NA's   :134     NA's   :12620

dim(ESS2)

## [1] 52458     7

ESS2$eduyrs = as.numeric(as.character(ESS2$eduyrs))
ESS2$agertr = as.numeric(as.character(ESS2$agertr))
ESS2$agea = as.numeric(as.character(ESS2$agea))
ESS2$hinctnta<-as.numeric(ESS1$hinctnta)

А здесь табличка!

Graphs for all variables

my_fn1 <- function(data, mapping, ...){
  p1 <- ggplot(data = ESS2, mapping = mapping) + 
    geom_point() + 
    geom_smooth(method=lm, fill="blue", color="blue", ...)
  p1
}

g1 = ggpairs(ESS2,columns = c(1,2,3,7), lower = list(continuous = my_fn1), title= "1")
g1

ESS3 = ESS2 %>% 
  select(eduyrs, agea, agertr,  hinctnta)
ESS3 = na.omit(ESS3)

b = cor(ESS3)
sjp.corr(ESS3, show.legend = TRUE)

Multiple Linear Regression

If X1 and X2 interact, this means that the effect of X1 on Y depends on the value of X2, and vice verca (с) скоммунизджено

ESS4 <- na.omit(subset(ESS2, icmnact=="In paid work"))

m1 <- lm(agertr ~ agea, data = ESS2)
summary(m1)

## 
## Call:
## lm(formula = agertr ~ agea, data = ESS2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.911  -4.061   0.239   3.739  60.239 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 56.910631   0.207751  273.94   <2e-16 ***
## agea         0.050004   0.003275   15.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.582 on 24520 degrees of freedom
##   (27936 observations deleted due to missingness)
## Multiple R-squared:  0.009416,   Adjusted R-squared:  0.009375 
## F-statistic: 233.1 on 1 and 24520 DF,  p-value: < 2.2e-16

m2 <- lm(agertr ~ agea + gndr, data = ESS2)
summary(m2)

## 
## Call:
## lm(formula = agertr ~ agea + gndr, data = ESS2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.776  -3.338   0.188   3.293  59.118 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 59.952025   0.228936  261.87   <2e-16 ***
## agea         0.052624   0.003221   16.34   <2e-16 ***
## gndr        -2.069862   0.070435  -29.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.486 on 24515 degrees of freedom
##   (27940 observations deleted due to missingness)
## Multiple R-squared:  0.04312,    Adjusted R-squared:  0.04304 
## F-statistic: 552.3 on 2 and 24515 DF,  p-value: < 2.2e-16

#anova(m1, m2)

m3 <- lm(agertr ~ agea + gndr + hinctnta, data = ESS2)
summary(m3)

## 
## Call:
## lm(formula = agertr ~ agea + gndr + hinctnta, data = ESS2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.220  -3.276   0.268   3.265  58.857 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 58.982941   0.305822 192.867   <2e-16 ***
## agea         0.055696   0.003827  14.553   <2e-16 ***
## gndr        -2.031120   0.078847 -25.760   <2e-16 ***
## hinctnta     0.146232   0.015274   9.574   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.476 on 19786 degrees of freedom
##   (32668 observations deleted due to missingness)
## Multiple R-squared:  0.04613,    Adjusted R-squared:  0.04598 
## F-statistic:   319 on 3 and 19786 DF,  p-value: < 2.2e-16

#anova(m2, m3)

m4 <- lm(agertr ~ agea + gndr + hinctnta + eduyrs, data = ESS2)
summary(m4)

## 
## Call:
## lm(formula = agertr ~ agea + gndr + hinctnta + eduyrs, data = ESS2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.083  -3.259   0.197   3.204  58.719 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 57.360370   0.338433 169.488  < 2e-16 ***
## agea         0.063566   0.003897  16.311  < 2e-16 ***
## gndr        -2.044813   0.078892 -25.919  < 2e-16 ***
## hinctnta     0.084504   0.016129   5.239 1.63e-07 ***
## eduyrs       0.122402   0.010612  11.535  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.46 on 19645 degrees of freedom
##   (32808 observations deleted due to missingness)
## Multiple R-squared:  0.05235,    Adjusted R-squared:  0.05216 
## F-statistic: 271.3 on 4 and 19645 DF,  p-value: < 2.2e-16

#anova(m3, m4)

library(rgl)
library(plot3Drgl)
library(plot3D)
#library(his3D)

ESS4 = na.omit(ESS2)
ESS4 = ESS4 %>% 
  select(agea, agertr, gndr, eduyrs)
#hist3D(ESS4$agea, ESS4$agertr, ESS4$agea, colvar=as.numeric(ESS4$gndr))
#plotrgl()
#scatter3Drgl()
#?hist3D_fancy
#data("iris")
#hist3D(iris$Sepal.Length, , colvar=as.numeric())
#ESS4$gndr = as.factor(ESS4$gndr)
#hist3D(, ,, border="black")

a = scatter3D(x = ESS4$agea, z = ESS4$eduyrs, y = ESS4$agertr, data = ESS4, groups = factor(ESS4$gndr))

##               [,1]         [,2]         [,3]         [,4]
## [1,]  1.802458e-02 -0.009721786  0.011585974 -0.011585974
## [2,]  1.071313e-02  0.008206731 -0.009780401  0.009780401
## [3,] -1.574324e-18  0.030641778  0.025711504 -0.025711504
## [4,] -1.697225e+00 -0.689723830 -3.465793782  4.465793782

#hist3D(x = ESS4$agea, z = ESS4$eduyrs, y = ESS4$agertr, data = ESS4, groups = factor(ESS4$gndr))
#plot3Drgl