#knitr::opts_chunk$set(echo = TRUE)

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

MØST: Statistikk og regresjonsanalyse ———-

Dette scriptet inneholder koden som vi går gjennom på samlingen. Sett først wd til samme lokasjon som prosjektfilene.

Laster inn nødvendige pakker. Disse pakkene er installert i rstudio.cloud, men

må installeres på forhånd dersom du jobber på egen maskin med desktopversjonen

av RStudio. Se for eksempel her for instruksjoner:

https://hotneim.github.io/met4/pakker.html

library(readxl)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)

#Setter filområdet vi skal hente filer fra

setwd("C:/Users/kich/Desktop/R_most")

Leser inn salgsdatafilen og ser på de øverste radene:

salg <- read_excel("salg.xlsx")
salg
## # A tibble: 50 × 3
##       x1    x2     y
##    <dbl> <dbl> <dbl>
##  1    27     0  237.
##  2    21     0  226.
##  3    47     0  261.
##  4    47     0  379.
##  5    24     0  177.
##  6    43     0  354.
##  7    27     0  144.
##  8    36     0  226.
##  9    32     0  222.
## 10    34     0  344.
## # ℹ 40 more rows

Lager et spredningsplott for å se på forholdet mellom antall selgere på jobb,

x1, og antall salg, y.´

plot(salg$x1, salg$y)

#alternativt med ggplot

setwd("C:/Users/kich/Desktop/R_most")
salg <- read_excel("salg.xlsx")
#View(salg)
ggplot(salg) +
  geom_point(aes(x = x1, y = y),
    color = "blue", size = 2, pch = 19) +
  ggtitle("Antall salg per selger") +
  xlab("Antall salg") +
  ylab("Antall selgere pa jobb")

# Vi estimerer en enkel lineær regresjonsmodell og lagrer den i objektet “reg1”:

reg1 <- lm(y ~ x1, data = salg)

Skriver ut et sammendrag av den estimerte modellen:

summary(reg1)
## 
## Call:
## lm(formula = y ~ x1, data = salg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -108.720  -33.162    1.282   32.846  105.968 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  15.7712    26.5534   0.594    0.555    
## x1            6.9935     0.7636   9.159 4.15e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 49.58 on 48 degrees of freedom
## Multiple R-squared:  0.636,  Adjusted R-squared:  0.6285 
## F-statistic: 83.88 on 1 and 48 DF,  p-value: 4.147e-12

Plotter på nytt og tegner inn den estimerte modellen

plot(salg$x1, salg$y)
abline(reg1)

# Lager residualplott

plot(reg1$fitted.values, reg1$residuals)

# Lager histogram

hist(reg1$residuals, breaks = 12)

# Lager QQ-plot

qqnorm(reg1$residuals)
qqline(reg1$residuals)

# Lager autokorrelasjonsplott

acf(reg1$residuals)

# Multippel regresjon bruker samme syntaks. Vi bruker “+” mellom forklaringsvariablene

reg2 <- lm(y ~ x1 + x2, data = salg)
summary(reg2)
## 
## Call:
## lm(formula = y ~ x1 + x2, data = salg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -107.223  -34.980    0.189   31.789  103.762 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  13.1982    28.5835   0.462    0.646    
## x1            7.0151     0.7755   9.045 7.36e-12 ***
## x2            3.7028    14.2437   0.260    0.796    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.07 on 47 degrees of freedom
## Multiple R-squared:  0.6366, Adjusted R-squared:  0.6211 
## F-statistic: 41.16 on 2 and 47 DF,  p-value: 4.678e-11

Kan også lage en fin tabell ved hjelp av “stargazer”-pakken som vi lastet inn helt i starten av scriptet.

stargazer(reg2, type = "text")
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                  y             
## -----------------------------------------------
## x1                           7.015***          
##                               (0.776)          
##                                                
## x2                             3.703           
##                              (14.244)          
##                                                
## Constant                      13.198           
##                              (28.584)          
##                                                
## -----------------------------------------------
## Observations                    50             
## R2                             0.637           
## Adjusted R2                    0.621           
## Residual Std. Error      50.072 (df = 47)      
## F Statistic           41.161*** (df = 2; 47)   
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

Pass opp for multikolinearitet når du gjør multippel regresjon!

skoledata <- read_excel("skoledata.xls", na = "NA")
head(skoledata)
## # A tibble: 6 × 11
##   kommunenavn    id driftsutgifter mobbing lesing antall fritak folketall bokmal
##   <chr>       <dbl>          <dbl>   <dbl>  <dbl>  <dbl>  <dbl>     <dbl>  <dbl>
## 1 Agdenes ko…  1622          116.     NA       44     24   NA        1733      1
## 2 Alstahaug …  1820          103.      3.6     47     78   11.8      7437      0
## 3 Alta kommu…  2012           99.3     2.9     50    255    7.2     20097      1
## 4 Alvdal kom…   438          102.      0       46     30   NA        2426      0
## 5 Andøy komm…  1871          115.     10       47     47   10.6      4980      0
## 6 Aremark ko…   118          113.     NA       NA     15   NA        1404      1
## # ℹ 2 more variables: nynorsk <dbl>, nord <dbl>
reg_skole1 <- lm(lesing ~ mobbing + log(folketall) + log(antall), data = skoledata)
reg_skole2 <- lm(lesing ~ mobbing + log(folketall), data = skoledata)
reg_skole3 <- lm(lesing ~ mobbing + log(antall), data = skoledata)

reg_skole4 <- lm(lesing ~ mobbing + log(folketall), data = skoledata)

stargazer(reg_skole1, reg_skole2, reg_skole3, type = "text")
## 
## ===========================================================================================
##                                               Dependent variable:                          
##                     -----------------------------------------------------------------------
##                                                     lesing                                 
##                               (1)                     (2)                     (3)          
## -------------------------------------------------------------------------------------------
## mobbing                      0.038                   0.034                   0.032         
##                             (0.031)                 (0.031)                 (0.031)        
##                                                                                            
## log(folketall)              1.336*                 0.552***                                
##                             (0.746)                 (0.106)                                
##                                                                                            
## log(antall)                 -0.765                                         0.512***        
##                             (0.720)                                         (0.102)        
##                                                                                            
## Constant                   40.154***               43.745***               46.389***       
##                             (3.513)                 (0.957)                 (0.489)        
##                                                                                            
## -------------------------------------------------------------------------------------------
## Observations                  288                     288                     288          
## R2                           0.096                   0.092                   0.085         
## Adjusted R2                  0.086                   0.086                   0.079         
## Residual Std. Error    1.960 (df = 284)        1.960 (df = 285)        1.967 (df = 285)    
## F Statistic         10.013*** (df = 3; 284) 14.448*** (df = 2; 285) 13.310*** (df = 2; 285)
## ===========================================================================================
## Note:                                                           *p<0.1; **p<0.05; ***p<0.01

Vi kan estimere en modell med interaksjonsledd ved å bruke “*” i stedet for “+” i lm-funksjonen

reg3 <- lm(y ~ x1 * x2, data = salg)
summary(reg3)
## 
## Call:
## lm(formula = y ~ x1 * x2, data = salg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -77.921 -34.358  -3.609  32.677 103.401 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   78.540     36.841   2.132   0.0384 *  
## x1             5.122      1.032   4.966 9.86e-06 ***
## x2          -124.106     50.891  -2.439   0.0187 *  
## x1:x2          3.811      1.464   2.604   0.0124 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47.25 on 46 degrees of freedom
## Multiple R-squared:  0.6833, Adjusted R-squared:  0.6626 
## F-statistic: 33.08 on 3 and 46 DF,  p-value: 1.507e-11

Vi kan legge til polynomledd, for eksempel et andregradsledd, for å beskrive en sammenheng som ikke er lineær:

eksamen <- read_excel("eksamen.xlsx")
plot(eksamen$antall_sider, eksamen$poeng)

reg_eksamen1 <- lm(poeng ~ antall_sider, data = eksamen)
summary(reg1)
## 
## Call:
## lm(formula = y ~ x1, data = salg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -108.720  -33.162    1.282   32.846  105.968 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  15.7712    26.5534   0.594    0.555    
## x1            6.9935     0.7636   9.159 4.15e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 49.58 on 48 degrees of freedom
## Multiple R-squared:  0.636,  Adjusted R-squared:  0.6285 
## F-statistic: 83.88 on 1 and 48 DF,  p-value: 4.147e-12
eksamen$antall_sider2 <- eksamen$antall_sider^2
reg_eksamen2 <- lm(poeng ~ antall_sider + antall_sider2, data = eksamen)
summary(reg_eksamen2)
## 
## Call:
## lm(formula = poeng ~ antall_sider + antall_sider2, data = eksamen)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -70.280 -16.328   2.376  15.575  56.720 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   14.08766   15.47647   0.910  0.36373    
## antall_sider   9.95215    2.39927   4.148 4.88e-05 ***
## antall_sider2 -0.25664    0.08895  -2.885  0.00432 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.49 on 209 degrees of freedom
## Multiple R-squared:  0.2241, Adjusted R-squared:  0.2167 
## F-statistic: 30.19 on 2 and 209 DF,  p-value: 3.044e-12

Estimering av faste effekter

df <- read.csv("panel_liten.csv")
head(df)
##      X lnhr lnwg kids age disab id year  mlhr  mlwg   dlhr    dwg
## 1 5241 7.71 3.19    0  30     0  1 1979 7.742 3.218 -0.032 -0.028
## 2 5242 7.64 3.14    0  32     0  1 1980 7.742 3.218 -0.102 -0.078
## 3 5243 7.71 3.11    0  33     0  1 1981 7.742 3.218 -0.032 -0.108
## 4 5244 7.72 3.00    0  34     0  1 1982 7.742 3.218 -0.022 -0.218
## 5 5245 7.72 2.94    0  35     0  1 1983 7.742 3.218 -0.022 -0.278
## 6 5246 7.73 3.12    1  35     0  1 1984 7.742 3.218 -0.012 -0.098

Vanlig regresjon:

reg_vanlig <- lm(lnwg ~ lnhr, data = df)
summary(reg_vanlig)
## 
## Call:
## lm(formula = lnwg ~ lnhr, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44254 -0.16781  0.02993  0.17888  0.53540 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -12.7635     2.8173   -4.53    1e-04 ***
## lnhr          2.0412     0.3698    5.52 6.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2411 on 28 degrees of freedom
## Multiple R-squared:  0.5211, Adjusted R-squared:  0.504 
## F-statistic: 30.47 on 1 and 28 DF,  p-value: 6.707e-06
library(plm)
## 
## Attaching package: 'plm'
## The following objects are masked from 'package:dplyr':
## 
##     between, lag, lead

Oversetter til panel-data-frame

p.df <- pdata.frame(df, index = c("id" ,"year"))

reg.fe <- plm(lnwg ~ lnhr,
              data = p.df,
              model = "within")

summary(reg.fe)
## Oneway (individual) effect Within Model
## 
## Call:
## plm(formula = lnwg ~ lnhr, data = p.df, model = "within")
## 
## Balanced Panel: n = 3, T = 10, N = 30
## 
## Residuals:
##      Min.   1st Qu.    Median   3rd Qu.      Max. 
## -0.270016 -0.042538 -0.011427  0.051738  0.336355 
## 
## Coefficients:
##      Estimate Std. Error t-value Pr(>|t|)
## lnhr  0.36293    0.26210  1.3847   0.1779
## 
## Total Sum of Squares:    0.37546
## Residual Sum of Squares: 0.34967
## R-Squared:      0.068678
## Adj. R-Squared: -0.038782
## F-statistic: 1.91731 on 1 and 26 DF, p-value: 0.17792

Hypotesetest for om koeffisientestimatet er statistisk signifikant forskjellig

fra null under antakelsen om at variansen er lik innad i gruppene

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
coeftest(reg.fe, vcovHC(reg.fe, type = 'HC0', cluster = 'group'))
## 
## t test of coefficients:
## 
##      Estimate Std. Error t value Pr(>|t|)
## lnhr  0.36293    0.29224  1.2419   0.2254
fixef(reg.fe)
##         1         2         3 
##  0.408228 -0.062905 -0.279994
plot(reg.fe)