title: ‘22041’ output: html_document: default —
library(dplyr)
library(ggplot2)
library(stringr)
library(readr)
library(sjlabelled)
library(sjmisc)
library(sjstats)
library(ggeffects)
library(sjPlot)
library(knitr)
library(psych)
library(magrittr)
library(kableExtra)
library(tidytext)
library(RColorBrewer)
library(haven)
df <- read_csv("D:/Documents/data1_1.csv")
ESS5e03 <- read_sav("D:/Documents/ESS5e03.sav")
# Read the literature and jutify what you do and want to expect
# Draw scatterplots to make sure further analysis is necessary
# Estimate the regression model
# Test the overall quality of the regression model
# Interpret regression coefficients (point estimates andconfidence intervals)
# Compare the strength of association with standardized coefficients
# Predict the outcome
df$books = as.numeric(as.character(df$books))
df$attend = as.numeric(as.character(df$attend))
df$grade = as.numeric(as.character(df$grade))
head(df)
## # A tibble: 6 x 3
## books attend grade
## <dbl> <dbl> <dbl>
## 1 0 9 45
## 2 1 15 57
## 3 0 10 45
## 4 2 16 51
## 5 4 10 65
## 6 4 20 88
str(df)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 40 obs. of 3 variables:
## $ books : num 0 1 0 2 4 4 1 4 3 0 ...
## $ attend: num 9 15 10 16 10 20 11 20 15 15 ...
## $ grade : num 45 57 45 51 65 88 44 87 89 59 ...
## - attr(*, "spec")=
## .. cols(
## .. books = col_double(),
## .. attend = col_double(),
## .. grade = col_double()
## .. )
summary(df)
## books attend grade
## Min. :0 Min. : 6.00 Min. :37.00
## 1st Qu.:1 1st Qu.:10.00 1st Qu.:51.00
## Median :2 Median :15.00 Median :60.50
## Mean :2 Mean :14.10 Mean :63.55
## 3rd Qu.:3 3rd Qu.:18.25 3rd Qu.:73.00
## Max. :4 Max. :20.00 Max. :97.00
par(mfrow=c(1, 3))
boxplot(df$books, main="books", sub=paste("Outlier rows: ", boxplot.stats(df$books)$out))
boxplot(df$attend, main="attend", sub=paste("Outlier rows: ", boxplot.stats(df$attend)$out))
boxplot(df$grade, main="grade", sub=paste("Outlier rows: ", boxplot.stats(df$grade)$out))
В графике ниже мы смотрим сразу на всё:
как распределены количественные переменные: в 1 и 3 случае это вроде называется univariate distribution (датаразбита по пяти столбикам, все одинаковой длины) И вот тут описание наших недогистограмм
коэффициент корреляции
поскольку корреляция кажется хуевенькой, мы также сразу можем посмотреть на числовые значения. И вот тут сделать выводы по ним
Это же просто гениально, можно не строить, мать его, шесть графиков!!!!!!!!!!!!*
my_fn <- function(data, mapping, ...){
p <- ggplot(data = df, mapping = mapping) +
geom_point() +
geom_smooth(method=lm, fill="blue", color="blue", ...)
p
}
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
g = ggpairs(df,columns = 1:3, lower = list(continuous = my_fn), title= "1" )
g
model1 = lm(grade ~ books, data = df)
model2 = lm(grade ~ attend, data = df)
model3 = lm(grade ~ books + attend, data = df)
summary(model1)
##
## Call:
## lm(formula = grade ~ books, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.025 -12.616 -1.181 10.425 33.450
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.075 4.035 12.905 1.83e-15 ***
## books 5.738 1.647 3.483 0.00127 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.73 on 38 degrees of freedom
## Multiple R-squared: 0.242, Adjusted R-squared: 0.222
## F-statistic: 12.13 on 1 and 38 DF, p-value: 0.001265
summary(model2)
##
## Call:
## lm(formula = grade ~ attend, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.777 -10.904 2.021 12.430 31.755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.998 8.169 4.529 5.71e-05 ***
## attend 1.883 0.555 3.393 0.00163 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.83 on 38 degrees of freedom
## Multiple R-squared: 0.2325, Adjusted R-squared: 0.2123
## F-statistic: 11.51 on 1 and 38 DF, p-value: 0.001629
summary(model3)
##
## Call:
## lm(formula = grade ~ books + attend, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.802 -13.374 0.060 9.173 32.295
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.379 7.745 4.827 2.41e-05 ***
## books 4.037 1.753 2.303 0.0270 *
## attend 1.284 0.587 2.187 0.0352 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.05 on 37 degrees of freedom
## Multiple R-squared: 0.3287, Adjusted R-squared: 0.2924
## F-statistic: 9.059 on 2 and 37 DF, p-value: 0.0006278
# the algorithm for summary2
#F-statistics to look at P-value (PR(>F)) - if significant, the model is OK
#Adjusted R squared: to look how good the model is: it is : how many percent the first variable explains the second
#Estimate: the intercept if the variable if equal to 0. In the case of grade and attend(2), if we do not atend classes we receive 37. If we attend 2 classes, it will be 37 + 2*1.883
#Compare models. They should be Nested models.
anova(model1, model3)
## Analysis of Variance Table
##
## Model 1: grade ~ books
## Model 2: grade ~ books + attend
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 38 8250.4
## 2 37 7306.2 1 944.16 4.7814 0.03517 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model2, model3)
## Analysis of Variance Table
##
## Model 1: grade ~ attend
## Model 2: grade ~ books + attend
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 38 8353.4
## 2 37 7306.2 1 1047.1 5.3028 0.02701 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
There is a helpful LINE mnemonic used for the Regression Assumptions:
The independence assumption is held because… Others can be checked by looking at the residuals of the models.
par(mfrow = c(2, 2))
plot(model1)
а потом еще две модели)))0
par(mfrow = c(2, 2))
plot(model2)
выводы 2
par(mfrow = c(2, 2))
plot(model3)
выводы 3
Здесь шота про бэкграунд
#data <- read_sav("ESS5e03.sav", to.data.frame = TRUE)
data = ESS5e03
savevars <- c("eduyrs", "agea", "agertr", "gndr",
"cntry", "icmnact", "hinctnta")
ESS1 <- data[savevars]
rm(data, savevars)
ESS2 = ESS1 %>%
select(eduyrs, agea, agertr, gndr, cntry, icmnact, hinctnta)
Смотрим на датасет
head(ESS2)
## # A tibble: 6 x 7
## eduyrs agea agertr gndr cntry icmnact hinctnta
## <dbl+lbl> <dbl+lb> <dbl+lbl> <dbl+lbl> <chr+lbl> <dbl+lbl> <dbl+lb>
## 1 15 22 NA 1 [Male] BE [Belgi~ 3 [All other~ NA
## 2 15 43 NA 1 [Male] BE [Belgi~ 1 [In paid w~ 5 [F]
## 3 13 19 NA 2 [Female] BE [Belgi~ 3 [All other~ 1 [J]
## 4 15 23 NA 2 [Female] BE [Belgi~ 1 [In paid w~ 10 [H]
## 5 15 58 60 2 [Female] BE [Belgi~ 3 [All other~ 6 [S]
## 6 13 62 60 2 [Female] BE [Belgi~ 2 [Retired] 5 [F]
str(ESS2)
## Classes 'tbl_df', 'tbl' and 'data.frame': 52458 obs. of 7 variables:
## $ eduyrs : 'haven_labelled' num 15 15 13 15 15 13 17 13 12 12 ...
## ..- attr(*, "label")= chr "Years of full-time education completed"
## ..- attr(*, "format.spss")= chr "F2.0"
## ..- attr(*, "labels")= Named num 77 88 99
## .. ..- attr(*, "names")= chr "Refusal" "Don't know" "No answer"
## $ agea : 'haven_labelled' num 22 43 19 23 58 62 26 40 48 68 ...
## ..- attr(*, "label")= chr "Age of respondent, calculated"
## ..- attr(*, "format.spss")= chr "F4.0"
## ..- attr(*, "labels")= Named num 999
## .. ..- attr(*, "names")= chr "Not available"
## $ agertr : 'haven_labelled' num NA NA NA NA 60 60 NA NA 58 61 ...
## ..- attr(*, "label")= chr "What age you would like to/would have liked to retire"
## ..- attr(*, "format.spss")= chr "F3.0"
## ..- attr(*, "labels")= Named num 666 777 888 999
## .. ..- attr(*, "names")= chr "Not applicable" "Refusal" "Don't know" "No answer"
## $ gndr : 'haven_labelled' num 1 1 2 2 2 2 1 1 1 1 ...
## ..- attr(*, "label")= chr "Gender"
## ..- attr(*, "format.spss")= chr "F1.0"
## ..- attr(*, "display_width")= int 6
## ..- attr(*, "labels")= Named num 1 2 9
## .. ..- attr(*, "names")= chr "Male" "Female" "No answer"
## $ cntry : 'haven_labelled' chr "BE" "BE" "BE" "BE" ...
## ..- attr(*, "label")= chr "Country"
## ..- attr(*, "format.spss")= chr "A2"
## ..- attr(*, "display_width")= int 7
## ..- attr(*, "labels")= Named chr "UA" "GB" "BE" "DE" ...
## .. ..- attr(*, "names")= chr "Ukraine" "United Kingdom" "Belgium" "Germany" ...
## $ icmnact : 'haven_labelled' num 3 1 3 1 3 2 1 1 1 2 ...
## ..- attr(*, "label")= chr "Interviewer code, main activity of respondent"
## ..- attr(*, "format.spss")= chr "F1.0"
## ..- attr(*, "display_width")= int 9
## ..- attr(*, "labels")= Named num 1 2 3 9
## .. ..- attr(*, "names")= chr "In paid work" "Retired" "All others" "Not available"
## $ hinctnta: 'haven_labelled' num NA 5 1 10 6 5 6 2 6 8 ...
## ..- attr(*, "label")= chr "Household's total net income, all sources"
## ..- attr(*, "format.spss")= chr "F2.0"
## ..- attr(*, "display_width")= int 10
## ..- attr(*, "labels")= Named num 1 2 3 4 5 6 7 8 9 10 ...
## .. ..- attr(*, "names")= chr "J" "R" "C" "M" ...
summary(ESS2)
## eduyrs agea agertr gndr
## Min. : 0.00 Min. : 14.00 Min. : 0.00 Min. :1.000
## 1st Qu.:10.00 1st Qu.: 33.00 1st Qu.: 56.00 1st Qu.:1.000
## Median :12.00 Median : 48.00 Median : 60.00 Median :2.000
## Mean :12.29 Mean : 48.51 Mean : 60.04 Mean :1.546
## 3rd Qu.:15.00 3rd Qu.: 63.00 3rd Qu.: 64.00 3rd Qu.:2.000
## Max. :55.00 Max. :102.00 Max. :120.00 Max. :2.000
## NA's :629 NA's :153 NA's :27902 NA's :21
## cntry icmnact hinctnta
## Length:52458 Min. :1.000 Min. : 1.000
## Class :haven_labelled 1st Qu.:1.000 1st Qu.: 3.000
## Mode :character Median :2.000 Median : 5.000
## Mean :1.812 Mean : 5.049
## 3rd Qu.:3.000 3rd Qu.: 7.000
## Max. :3.000 Max. :10.000
## NA's :134 NA's :12620
dim(ESS2)
## [1] 52458 7
ESS2$eduyrs = as.numeric(as.character(ESS2$eduyrs))
ESS2$agertr = as.numeric(as.character(ESS2$agertr))
ESS2$agea = as.numeric(as.character(ESS2$agea))
ESS2$hinctnta<-as.numeric(ESS1$hinctnta)
А здесь табличка!
my_fn1 <- function(data, mapping, ...){
p1 <- ggplot(data = ESS2, mapping = mapping) +
geom_point() +
geom_smooth(method=lm, fill="blue", color="blue", ...)
p1
}
g1 = ggpairs(ESS2,columns = c(1,2,3,7), lower = list(continuous = my_fn1), title= "1")
g1
ESS3 = ESS2 %>%
select(eduyrs, agea, agertr, hinctnta)
ESS3 = na.omit(ESS3)
b = cor(ESS3)
sjp.corr(ESS3, show.legend = TRUE)
If X1 and X2 interact, this means that the effect of X1 on Y depends on the value of X2, and vice verca (с) скоммунизджено
ESS4 <- na.omit(subset(ESS2, icmnact=="In paid work"))
m1 <- lm(agertr ~ agea, data = ESS2)
summary(m1)
##
## Call:
## lm(formula = agertr ~ agea, data = ESS2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.911 -4.061 0.239 3.739 60.239
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.910631 0.207751 273.94 <2e-16 ***
## agea 0.050004 0.003275 15.27 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.582 on 24520 degrees of freedom
## (27936 observations deleted due to missingness)
## Multiple R-squared: 0.009416, Adjusted R-squared: 0.009375
## F-statistic: 233.1 on 1 and 24520 DF, p-value: < 2.2e-16
m2 <- lm(agertr ~ agea + gndr, data = ESS2)
summary(m2)
##
## Call:
## lm(formula = agertr ~ agea + gndr, data = ESS2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.776 -3.338 0.188 3.293 59.118
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59.952025 0.228936 261.87 <2e-16 ***
## agea 0.052624 0.003221 16.34 <2e-16 ***
## gndr -2.069862 0.070435 -29.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.486 on 24515 degrees of freedom
## (27940 observations deleted due to missingness)
## Multiple R-squared: 0.04312, Adjusted R-squared: 0.04304
## F-statistic: 552.3 on 2 and 24515 DF, p-value: < 2.2e-16
#anova(m1, m2)
m3 <- lm(agertr ~ agea + gndr + hinctnta, data = ESS2)
summary(m3)
##
## Call:
## lm(formula = agertr ~ agea + gndr + hinctnta, data = ESS2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.220 -3.276 0.268 3.265 58.857
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.982941 0.305822 192.867 <2e-16 ***
## agea 0.055696 0.003827 14.553 <2e-16 ***
## gndr -2.031120 0.078847 -25.760 <2e-16 ***
## hinctnta 0.146232 0.015274 9.574 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.476 on 19786 degrees of freedom
## (32668 observations deleted due to missingness)
## Multiple R-squared: 0.04613, Adjusted R-squared: 0.04598
## F-statistic: 319 on 3 and 19786 DF, p-value: < 2.2e-16
#anova(m2, m3)
m4 <- lm(agertr ~ agea + gndr + hinctnta + eduyrs, data = ESS2)
summary(m4)
##
## Call:
## lm(formula = agertr ~ agea + gndr + hinctnta + eduyrs, data = ESS2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.083 -3.259 0.197 3.204 58.719
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57.360370 0.338433 169.488 < 2e-16 ***
## agea 0.063566 0.003897 16.311 < 2e-16 ***
## gndr -2.044813 0.078892 -25.919 < 2e-16 ***
## hinctnta 0.084504 0.016129 5.239 1.63e-07 ***
## eduyrs 0.122402 0.010612 11.535 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.46 on 19645 degrees of freedom
## (32808 observations deleted due to missingness)
## Multiple R-squared: 0.05235, Adjusted R-squared: 0.05216
## F-statistic: 271.3 on 4 and 19645 DF, p-value: < 2.2e-16
#anova(m3, m4)
library(rgl)
library(plot3Drgl)
library(plot3D)
#library(his3D)
ESS4 = na.omit(ESS2)
ESS4 = ESS4 %>%
select(agea, agertr, gndr, eduyrs)
#hist3D(ESS4$agea, ESS4$agertr, ESS4$agea, colvar=as.numeric(ESS4$gndr))
#plotrgl()
#scatter3Drgl()
#?hist3D_fancy
#data("iris")
#hist3D(iris$Sepal.Length, , colvar=as.numeric())
#ESS4$gndr = as.factor(ESS4$gndr)
#hist3D(, ,, border="black")
a = scatter3D(x = ESS4$agea, z = ESS4$eduyrs, y = ESS4$agertr, data = ESS4, groups = factor(ESS4$gndr))
a
## [,1] [,2] [,3] [,4]
## [1,] 1.802458e-02 -0.009721786 0.011585974 -0.011585974
## [2,] 1.071313e-02 0.008206731 -0.009780401 0.009780401
## [3,] -1.574324e-18 0.030641778 0.025711504 -0.025711504
## [4,] -1.697225e+00 -0.689723830 -3.465793782 4.465793782
#hist3D(x = ESS4$agea, z = ESS4$eduyrs, y = ESS4$agertr, data = ESS4, groups = factor(ESS4$gndr))
#plot3Drgl