podatki <- read.table("./PZD.csv", header=TRUE, sep=";", dec=",")

head(podatki, 10) #Prikažemo prvih 10 opazovanj
##       Drzava   PZD Alkohol Solanje Teza  HIV Razvoj
## 1    Algeria 76.23    0.58     7.8 27.4 0.03      1
## 2     Angola 62.22    5.94     5.0  8.2 1.92      1
## 3  Argentina 76.77    8.43     9.8 28.3 0.71      1
## 4  Australia 82.58    9.68    13.2 29.0 0.07      0
## 5    Austria 81.37   11.60    11.3 20.1 0.22      0
## 6    Bahamas 75.57    8.71    10.9 31.6 1.66      1
## 7   Barbados 75.46    9.00    10.5 23.1 0.73      1
## 8    Belarus 73.84    9.72    12.0 24.5 0.28      1
## 9    Belgium 80.93   10.36    11.4 24.5 0.20      0
## 10    Belize 70.25    6.26    10.5 24.1 1.04      1

Opis spremenljivk:

library(pastecs)
round(stat.desc(podatki[-1]), 2)
##                  PZD Alkohol Solanje    Teza    HIV Razvoj
## nbr.val        90.00   90.00   90.00   90.00  90.00  90.00
## nbr.null        0.00    0.00    0.00    0.00   0.00  28.00
## nbr.na          0.00    0.00    0.00    0.00   0.00   0.00
## min            52.11    0.01    1.40    2.10   0.01   0.00
## max            83.96   16.70   13.30   47.30  29.19   1.00
## range          31.85   16.69   11.90   45.20  29.18   1.00
## sum          6652.94  575.65  859.50 1851.50 210.79  62.00
## median         75.47    6.95   10.20   21.80   0.26   1.00
## mean           73.92    6.40    9.55   20.57   2.34   0.69
## SE.mean         0.78    0.39    0.29    0.91   0.61   0.05
## CI.mean.0.95    1.56    0.78    0.58    1.80   1.22   0.10
## var            55.19   13.91    7.77   74.14  33.66   0.22
## std.dev         7.43    3.73    2.79    8.61   5.80   0.47
## coef.var        0.10    0.58    0.29    0.42   2.48   0.68
library(car)
scatterplotMatrix(podatki[ , c(-1, -7)],  #Matrika razsevnih grafikonov
                  smooth = FALSE) 

library(Hmisc)
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
rcorr(as.matrix(podatki[ , c(-1, -7)])) #Izpišemo korelacijsko matriko
##           PZD Alkohol Solanje  Teza   HIV
## PZD      1.00    0.27    0.72  0.35 -0.65
## Alkohol  0.27    1.00    0.50  0.06 -0.13
## Solanje  0.72    0.50    1.00  0.42 -0.34
## Teza     0.35    0.06    0.42  1.00 -0.20
## HIV     -0.65   -0.13   -0.34 -0.20  1.00
## 
## n= 90 
## 
## 
## P
##         PZD    Alkohol Solanje Teza   HIV   
## PZD            0.0098  0.0000  0.0008 0.0000
## Alkohol 0.0098         0.0000  0.5990 0.2173
## Solanje 0.0000 0.0000          0.0000 0.0011
## Teza    0.0008 0.5990  0.0000         0.0547
## HIV     0.0000 0.2173  0.0011  0.0547
fit1 <- lm(PZD ~ Alkohol + Solanje + Teza + HIV,
           data = podatki) #Navedemo odvisno ter pojasnjevalne spremenljivke in ime tabele s podatki

vif(fit1) #Multikolinearnost
##  Alkohol  Solanje     Teza      HIV 
## 1.392887 1.802436 1.264037 1.136844
mean(vif(fit1))
## [1] 1.399051
podatki$StdOstanki <- round(rstandard(fit1), 3) #Shranimo standardizirane ostanke
podatki$CooksD <- round(cooks.distance(fit1), 3) #Shranimo Cookove razdalje

hist(podatki$StdOstanki, 
     xlab = "Standardizirani ostanki", 
     ylab = "Frekvenca", 
     main = "Histrogram standardiziranih ostankov")

shapiro.test(podatki$StdOstanki)
## 
##  Shapiro-Wilk normality test
## 
## data:  podatki$StdOstanki
## W = 0.97754, p-value = 0.1213
hist(podatki$CooksD, 
     xlab = "Cookove razdalje", 
     ylab = "Frekvenca", 
     main = "Histrogram Cookovih razdalj")

head(podatki[order(podatki$StdOstanki),], 3) #Prikaz treh enot z najnižjo vrednostjo standardiziranih ostankov.
##        Drzava   PZD Alkohol Solanje Teza  HIV Razvoj StdOstanki CooksD
## 60    Nigeria 54.78    7.83     6.0  8.9 3.11      1     -3.127  0.108
## 44 Kazakhstan 70.57    3.30    11.7 21.0 0.11      1     -2.157  0.043
## 86 Uzbekistan 72.09    1.59    12.0 16.6 0.08      1     -2.021  0.076
head(podatki[order(-podatki$CooksD),], 6) #Prikaz šestih enot z najvišjo vrednostjo Cookovih razdalj.
##        Drzava   PZD Alkohol Solanje Teza   HIV Razvoj StdOstanki CooksD
## 60    Nigeria 54.78    7.83     6.0  8.9  3.11      1     -3.127  0.108
## 56    Namibia 63.38    4.17     6.7 17.2 23.87      1      1.612  0.105
## 52 Micronesia 69.27    1.59     9.7 45.8  0.76      1     -1.792  0.101
## 86 Uzbekistan 72.09    1.59    12.0 16.6  0.08      1     -2.021  0.076
## 79  Swaziland 57.35    7.29     6.8 16.5 29.19      1      0.999  0.071
## 47    Lesotho 52.11    2.81     6.1 16.6 23.80      1     -1.196  0.058
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
## 
##     src, summarize
## The following objects are masked from 'package:pastecs':
## 
##     first, last
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
podatki <- podatki %>%
  filter(!Drzava == "Nigeria") #Odstranimo Nigerijo
fit1 <- lm(PZD ~ Alkohol + Solanje + Teza + HIV, 
           data = podatki)
podatki$StdOstanki <- round(rstandard(fit1), 3) #Shranimo standardizirane ostanke
podatki$CooksD <- round(cooks.distance(fit1), 3) #Shranimo Cookove razdalje

hist(podatki$StdOstanki, 
     xlab = "Standardizirani ostanki", 
     ylab = "Frekvenca", 
     main = "Histrogram standardiziranih ostankov")

shapiro.test(podatki$StdOstanki)
## 
##  Shapiro-Wilk normality test
## 
## data:  podatki$StdOstanki
## W = 0.97413, p-value = 0.07219
hist(podatki$CooksD, 
     xlab = "Cookove razdalje", 
     ylab = "Frekvenca", 
     main = "Histrogram Cookovih razdalj")

podatki$StdOcenVred <- scale(fit1$fitted.values)

library(car)
scatterplot(y = podatki$StdOstanki, x = podatki$StdOcenVred,
            ylab = "Standardizirani ostanki",
            xlab = "Standardizirane ocenjene vrednosti",
            boxplots = FALSE,
            regLine = FALSE,
            smooth = FALSE)

library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
ols_test_breusch_pagan(fit1)
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##              Data               
##  -------------------------------
##  Response : PZD 
##  Variables: fitted values of PZD 
## 
##         Test Summary         
##  ----------------------------
##  DF            =    1 
##  Chi2          =    0.7854664 
##  Prob > Chi2   =    0.3754745
summary(fit1) #Rezultati regresijske analize
## 
## Call:
## lm(formula = PZD ~ Alkohol + Solanje + Teza + HIV, data = podatki)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.5356 -2.6940  0.6189  2.7663  8.7668 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 61.850621   1.732141  35.708  < 2e-16 ***
## Alkohol     -0.140182   0.132072  -1.061    0.292    
## Solanje      1.532527   0.201573   7.603 3.76e-11 ***
## Teza        -0.007066   0.054310  -0.130    0.897    
## HIV         -0.587151   0.076287  -7.697 2.45e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.912 on 84 degrees of freedom
## Multiple R-squared:  0.7169, Adjusted R-squared:  0.7034 
## F-statistic: 53.18 on 4 and 84 DF,  p-value: < 2.2e-16
sqrt(summary(fit1)$r.squared) #Multipli korelacijski koeficient
## [1] 0.8467003
library(ppcor)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:olsrr':
## 
##     cement
## The following object is masked from 'package:dplyr':
## 
##     select
parc_kor <- pcor(podatki[ ,c("PZD", "Alkohol", "Solanje", "Teza", "HIV")]) #Izračun parcialnih korelacijskih koeficientov
round(parc_kor$estimate, 2)
##           PZD Alkohol Solanje  Teza   HIV
## PZD      1.00   -0.12    0.64 -0.01 -0.64
## Alkohol -0.12    1.00    0.48 -0.18 -0.04
## Solanje  0.64    0.48    1.00  0.32  0.25
## Teza    -0.01   -0.18    0.32  1.00 -0.06
## HIV     -0.64   -0.04    0.25 -0.06  1.00
round(parc_kor$p.value, 3)
##           PZD Alkohol Solanje  Teza   HIV
## PZD     0.000   0.292   0.000 0.897 0.000
## Alkohol 0.292   0.000   0.000 0.093 0.687
## Solanje 0.000   0.000   0.000 0.003 0.018
## Teza    0.897   0.093   0.003 0.000 0.577
## HIV     0.000   0.687   0.018 0.577 0.000
library(lm.beta)
lm.beta(fit1)
## 
## Call:
## lm(formula = PZD ~ Alkohol + Solanje + Teza + HIV, data = podatki)
## 
## Standardized Coefficients::
##  (Intercept)      Alkohol      Solanje         Teza          HIV 
##           NA -0.073127103  0.592684954 -0.008427659 -0.476860987
podatki$RazvojF <- factor(podatki$Razvoj,
                          levels = c(0, 1),  
                          labels = c("Razvita drzava", "Drzava v razvoju")) 

fit2 <- lm(PZD ~ Alkohol + Solanje + Teza + HIV + RazvojF, 
           data = podatki)

summary(fit2)
## 
## Call:
## lm(formula = PZD ~ Alkohol + Solanje + Teza + HIV + RazvojF, 
##     data = podatki)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.2078 -2.6832  0.8008  2.4541  6.4282 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             70.10156    2.45033  28.609  < 2e-16 ***
## Alkohol                 -0.40695    0.13428  -3.031  0.00325 ** 
## Solanje                  1.13724    0.20373   5.582 2.92e-07 ***
## Teza                     0.03061    0.04997   0.613  0.54184    
## HIV                     -0.56003    0.06942  -8.067 4.79e-12 ***
## RazvojFDrzava v razvoju -5.25478    1.19810  -4.386 3.37e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.546 on 83 degrees of freedom
## Multiple R-squared:  0.7702, Adjusted R-squared:  0.7563 
## F-statistic: 55.63 on 5 and 83 DF,  p-value: < 2.2e-16
anova(fit1, fit2) #Primerjava dveh regresijskih funkcij, ki se razlikujeta v številu pojasnjevalnih spremenljivk
## Analysis of Variance Table
## 
## Model 1: PZD ~ Alkohol + Solanje + Teza + HIV
## Model 2: PZD ~ Alkohol + Solanje + Teza + HIV + RazvojF
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1     84 1285.6                                  
## 2     83 1043.7  1     241.9 19.236 3.366e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
podatki$OcenVred <- fitted.values(fit2) #Ocenjene vrednosti na podlagi funkcije fit2
podatki$Ostanek <- residuals(fit2) #Izračun ostankov na podlagi funkcije fit2
head(podatki[colnames(podatki) %in% c("Drzava", "PZD", "OcenVred", "Ostanek")]) #Prikaz tabele s podatki z izbranimi spremenljivkami
##      Drzava   PZD OcenVred    Ostanek
## 1   Algeria 76.23 74.30322  1.9267770
## 2    Angola 62.22 67.29150 -5.0714988
## 3 Argentina 76.77 73.02991  3.7400912
## 4 Australia 82.58 82.02248  0.5575169
## 5   Austria 81.37 78.72393  2.6460670
## 6   Bahamas 75.57 73.73592  1.8340811