We downloaded the ESS data for Norway for 2016 and looked at some variables.

head(ESS1)
##     gndr agea    health       hinctnta eduyrs           happy
## 1   Male   74      Good M - 4th decile     10               8
## 2   Male   17      Fair D - 9th decile     12               8
## 3   Male   54 Very good F - 5th decile      9               9
## 4 Female   50      Good S - 6th decile     17 Extremely happy
## 5   Male   25 Very good R - 2nd decile     13               8
## 6   Male   43      Good J - 1st decile     11               9
ESS1$health <-as.numeric(ESS1$health)
hist(ESS1$health, breaks = 6, col="#00FFFF", main="Subjective general health", xlab="health")
abline(v = mean(ESS1$health), col = "red", lwd = 3)
abline(v = median(ESS1$health), col = "blue", lwd = 3)

ESS1$hinctnta <-as.numeric(ESS1$hinctnta)
hist(ESS1$hinctnta, breaks = 10, col="#00FFFF", main="Income", xlab="income")
abline(v = mean(ESS1$hinctnta), col = "red", lwd = 3)
abline(v = median(ESS1$hinctnta), col = "blue", lwd = 3)

ESS1$eduyrs <-as.numeric(ESS1$eduyrs)
hist(ESS1$eduyrs, breaks = 20, col="#00FFFF", main="Education (years)", xlab="years of education")
abline(v = mean(ESS1$eduyrs), col = "red", lwd = 3)
abline(v = median(ESS1$eduyrs), col = "blue", lwd = 3)

ESS1$happy <-as.numeric(ESS1$happy)
hist(ESS1$happy, breaks = 20, col="#00FFFF", main="Happiness (level)", xlab="degree of happiness")
abline(v = mean(ESS1$happy), col = "red", lwd = 3)
abline(v = median(ESS1$happy), col = "blue", lwd = 3)

ESS1$agea <-as.numeric(ESS1$agea)
hist(ESS1$agea, breaks = 20, col="#00FFFF", main="Age", xlab="years")
abline(v = mean(ESS1$agea), col = "red", lwd = 3)
abline(v = median(ESS1$agea), col = "blue", lwd = 3)

summary(ESS1)
##      gndr          agea           health         hinctnta     
##  Male  :794   Min.   : 1.00   Min.   :1.000   Min.   : 1.000  
##  Female:669   1st Qu.:18.00   1st Qu.:1.000   1st Qu.: 3.000  
##               Median :33.00   Median :2.000   Median : 5.000  
##               Mean   :33.42   Mean   :1.965   Mean   : 5.266  
##               3rd Qu.:47.50   3rd Qu.:2.000   3rd Qu.: 7.000  
##               Max.   :82.00   Max.   :5.000   Max.   :10.000  
##      eduyrs          happy       
##  Min.   : 1.00   Min.   : 1.000  
##  1st Qu.:11.00   1st Qu.: 9.000  
##  Median :13.00   Median : 9.000  
##  Mean   :13.22   Mean   : 9.123  
##  3rd Qu.:16.00   3rd Qu.:10.000  
##  Max.   :25.00   Max.   :11.000

At the beggining, let’s calculate coefficients before cheking for outliers.

cor.test(ESS1$hinctnta, ESS1$eduyrs, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  ESS1$hinctnta and ESS1$eduyrs
## t = 10.599, df = 1461, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2189715 0.3141694
## sample estimates:
##       cor 
## 0.2672223
m1 <- lm(ESS1$eduyrs ~ ESS1$hinctnta)
summary(m1)
## 
## Call:
## lm(formula = ESS1$eduyrs ~ ESS1$hinctnta)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.2157  -2.4884  -0.0338   2.5116  11.9662 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   11.30654    0.20309   55.67   <2e-16 ***
## ESS1$hinctnta  0.36365    0.03431   10.60   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.549 on 1461 degrees of freedom
## Multiple R-squared:  0.07141,    Adjusted R-squared:  0.07077 
## F-statistic: 112.3 on 1 and 1461 DF,  p-value: < 2.2e-16
plot(ESS1$hinctnta, ESS1$eduyrs, pch=16,  xlab = "level of income (deciles)", ylab = "years of education")

As scatterplot shows, there can be some outliers. So, further we will to look at it with the function.

Let’s define if there is outliers

library(dplyr)
library(ggplot2)
is_outlier <- function(x) {
  return(x < quantile(x, 0.25) - 1.5 * IQR(x) | x > quantile(x, 0.75) + 1.5 * IQR(x))
}
dat <- ESS1 %>% tibble::rownames_to_column(var="outlier") %>% mutate(is_outlier=ifelse(is_outlier(eduyrs), eduyrs, as.numeric(NA)))
dat$outlier[which(is.na(dat$is_outlier))] <- as.numeric(NA)
car::Boxplot(ESS1$eduyrs~ESS1$hinctnta, xlab = "level of income(deciles)", ylab = "years of education", col = "blue", method = "y")

##  [1] "224"  "391"  "1193" "1330" "704"  "442"  "299"  "1299" "1242" "570" 
## [11] "1444" "538"  "1087" "743"

It has shown to us, that there are some outliers, which can underestimate or overestimate the parameters of our model. Let’s delete the outliers.

library(car)
ESS1 <- ESS1[-c(224, 391,  1193, 1330, 704,  442,  299, 1299, 1242, 
570, 1444,  538, 1087, 743),]
car::Boxplot(ESS1$eduyrs~ESS1$hinctnta, xlab = "level of income(deciles)", ylab = "years of education", col = "blue", method = "y")

Checking the ssumptions

After deleting the outliers, we need also check other assumption, so that we can generalize for the all population.

Firstly, we check the normal distribution of residuals with Shapiro test. H0: the residuals are distributed normally. H1: the residuals are distributed not normally

model = lm(ESS1$eduyrs ~ ESS1$hinctnta)
res = model$residuals 
shapiro.test(res)
## 
##  Shapiro-Wilk normality test
## 
## data:  res
## W = 0.99777, p-value = 0.04413

The residuals are distributed not normal, because Shapiro-Wilk test shows significant result (p-value = 0.04413 (< 0,05)), so we reject the null hypothesis of normal distribution. However, it is not very significant (not more than 0,01). In addition, this test can overestimate the departures from the null hypothesis for large sample, which is our too.

hist(model$residuals ,
     main="The distribution of residuals", 
     xlab = "residuals",
     border="blue", 
     col="yellow")

qqnorm(res); qqline(res, col= 2)

From histogram ang Q-Q plot, however, we can conclude, that the distribution is very similar to normal.

Next, we have to check the homogenity of variances. We will use the Levene test for non-normal distribution. Ho: the variances are equal across all the samples. H1: the variances are unequal

leveneTest(as.numeric(ESS1$eduyrs) ~ as.factor(ESS1$hinctnta))
## Levene's Test for Homogeneity of Variance (center = median)
##         Df F value  Pr(>F)  
## group    9  1.6757 0.08987 .
##       1439                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

As, the Levenest test shows unsignificant result (p-value=0.08987 (>0.05)), we can not reject the null hypothesis, so the variances are equal

Now we can make some calsulations for our model.

Calculating correlation

cor.test(ESS1$hinctnta, ESS1$eduyrs, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  ESS1$hinctnta and ESS1$eduyrs
## t = 11.332, df = 1447, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2374867 0.3321060
## sample estimates:
##       cor 
## 0.2854919

As we can see, before deleting outliers, the correlation coefficient was 0.2672223, after deleting the outliers the correlation coefficient has increased to 0.2854919.

Scatterplot for level of income and years of education

plot(ESS1$hinctnta, ESS1$eduyrs, pch=16,  xlab = "level of income (deciles)", ylab = "years of education")

The scatterplot looks a little bit different now, as the outliers have been deleted.

plot(ESS1$hinctnta[ESS1$gndr == "Female"], ESS1$eduyrs[ESS1$gndr == "Female"], pch=16, xlab = "level of income (deciles)", ylab = "years of education", main = "Only females")

plot(ESS1$hinctnta[ESS1$gndr == "Male"], ESS1$eduyrs[ESS1$gndr == "Male"], pch=16, xlab = "level of income (deciles)", ylab = "years of education", main = "Only males")

Regression

First model

m1 <- lm(ESS1$eduyrs ~ ESS1$hinctnta)
summary(m1)
## 
## Call:
## lm(formula = ESS1$eduyrs ~ ESS1$hinctnta)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.7952 -2.4222 -0.0492  2.4589  9.7129 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   11.30315    0.19493   57.98   <2e-16 ***
## ESS1$hinctnta  0.37300    0.03292   11.33   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.392 on 1447 degrees of freedom
## Multiple R-squared:  0.08151,    Adjusted R-squared:  0.08087 
## F-statistic: 128.4 on 1 and 1447 DF,  p-value: < 2.2e-16

Like correlation coefficient, the coefficient determination has increased from 0.07141 (adjusted R-squared: 0.07077) before deleting outliers to 0.08151, (adjusted R-squared: 0.08087) after deleting the outliers. The b and the intercept have increased slightly too.

plot(ESS1$hinctnta , ESS1$eduyrs, col = "blue", pch = 16, xlab = "years of education", ylab = "level of income (deciles)")
abline(m1, col = "red")

ESS1$gndr <- as.numeric(ESS1$gndr)
ESS1$hinctnta <- as.numeric(ESS1$hinctnta)
m2 <- lm(eduyrs ~ hinctnta + gndr, data = ESS1)
summary(m2)
## 
## Call:
## lm(formula = eduyrs ~ hinctnta + gndr, data = ESS1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5429 -2.4077  0.0551  2.4063  9.4063 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.49206    0.33248  31.557  < 2e-16 ***
## hinctnta     0.37840    0.03287  11.510  < 2e-16 ***
## gndr         0.53720    0.17865   3.007  0.00268 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.382 on 1446 degrees of freedom
## Multiple R-squared:  0.08721,    Adjusted R-squared:  0.08595 
## F-statistic: 69.08 on 2 and 1446 DF,  p-value: < 2.2e-16
anova(m1, m2) 
## Analysis of Variance Table
## 
## Response: ESS1$eduyrs
##                 Df  Sum Sq Mean Sq F value    Pr(>F)    
## ESS1$hinctnta    1  1477.2  1477.2   128.4 < 2.2e-16 ***
## Residuals     1447 16646.9    11.5                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(stargazer)
stargazer(m1, m2, type = "text")
## 
## ======================================================================
##                                    Dependent variable:                
##                     --------------------------------------------------
##                              eduyrs                    eduyrs         
##                                (1)                      (2)           
## ----------------------------------------------------------------------
## hinctnta                    0.373***                                  
##                              (0.033)                                  
##                                                                       
## hinctnta                                              0.378***        
##                                                       (0.033)         
##                                                                       
## gndr                                                  0.537***        
##                                                       (0.179)         
##                                                                       
## Constant                    11.303***                10.492***        
##                              (0.195)                  (0.332)         
##                                                                       
## ----------------------------------------------------------------------
## Observations                  1,449                    1,449          
## R2                            0.082                    0.087          
## Adjusted R2                   0.081                    0.086          
## Residual Std. Error     3.392 (df = 1447)        3.382 (df = 1446)    
## F Statistic         128.404*** (df = 1; 1447) 69.080*** (df = 2; 1446)
## ======================================================================
## Note:                                      *p<0.1; **p<0.05; ***p<0.01
library(sjPlot)
sjt.lm(m2, show.ci = F)
    eduyrs
    B p
(Intercept)   10.49 <.001
hinctnta   0.38 <.001
gndr   0.54 .003
Observations   1449
R2 / adj. R2   .087 / .086