podatki <- read.table("./PZD.csv", header=TRUE, sep=";", dec=",")
head(podatki, 10) #Prikažemo prvih 10 opazovanj
## Drzava PZD Alkohol Solanje Teza HIV Razvoj
## 1 Algeria 76.23 0.58 7.8 27.4 0.03 1
## 2 Angola 62.22 5.94 5.0 8.2 1.92 1
## 3 Argentina 76.77 8.43 9.8 28.3 0.71 1
## 4 Australia 82.58 9.68 13.2 29.0 0.07 0
## 5 Austria 81.37 11.60 11.3 20.1 0.22 0
## 6 Bahamas 75.57 8.71 10.9 31.6 1.66 1
## 7 Barbados 75.46 9.00 10.5 23.1 0.73 1
## 8 Belarus 73.84 9.72 12.0 24.5 0.28 1
## 9 Belgium 80.93 10.36 11.4 24.5 0.20 0
## 10 Belize 70.25 6.26 10.5 24.1 1.04 1
Opis spremenljivk:
library(pastecs)
round(stat.desc(podatki[-1]), 2)
## PZD Alkohol Solanje Teza HIV Razvoj
## nbr.val 90.00 90.00 90.00 90.00 90.00 90.00
## nbr.null 0.00 0.00 0.00 0.00 0.00 28.00
## nbr.na 0.00 0.00 0.00 0.00 0.00 0.00
## min 52.11 0.01 1.40 2.10 0.01 0.00
## max 83.96 16.70 13.30 47.30 29.19 1.00
## range 31.85 16.69 11.90 45.20 29.18 1.00
## sum 6652.94 575.65 859.50 1851.50 210.79 62.00
## median 75.47 6.95 10.20 21.80 0.26 1.00
## mean 73.92 6.40 9.55 20.57 2.34 0.69
## SE.mean 0.78 0.39 0.29 0.91 0.61 0.05
## CI.mean.0.95 1.56 0.78 0.58 1.80 1.22 0.10
## var 55.19 13.91 7.77 74.14 33.66 0.22
## std.dev 7.43 3.73 2.79 8.61 5.80 0.47
## coef.var 0.10 0.58 0.29 0.42 2.48 0.68
library(car)
scatterplotMatrix(podatki[ , c(-1, -7)], #Matrika razsevnih grafikonov
smooth = FALSE)
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
rcorr(as.matrix(podatki[ , c(-1, -7)])) #Izpišemo korelacijsko matriko
## PZD Alkohol Solanje Teza HIV
## PZD 1.00 0.27 0.72 0.35 -0.65
## Alkohol 0.27 1.00 0.50 0.06 -0.13
## Solanje 0.72 0.50 1.00 0.42 -0.34
## Teza 0.35 0.06 0.42 1.00 -0.20
## HIV -0.65 -0.13 -0.34 -0.20 1.00
##
## n= 90
##
##
## P
## PZD Alkohol Solanje Teza HIV
## PZD 0.0098 0.0000 0.0008 0.0000
## Alkohol 0.0098 0.0000 0.5990 0.2173
## Solanje 0.0000 0.0000 0.0000 0.0011
## Teza 0.0008 0.5990 0.0000 0.0547
## HIV 0.0000 0.2173 0.0011 0.0547
fit1 <- lm(PZD ~ Alkohol + Solanje + Teza + HIV,
data = podatki) #Navedemo odvisno ter pojasnjevalne spremenljivke in ime tabele s podatki
vif(fit1) #Multikolinearnost
## Alkohol Solanje Teza HIV
## 1.392887 1.802436 1.264037 1.136844
mean(vif(fit1))
## [1] 1.399051
podatki$StdOstanki <- round(rstandard(fit1), 3) #Shranimo standardizirane ostanke
podatki$CooksD <- round(cooks.distance(fit1), 3) #Shranimo Cookove razdalje
hist(podatki$StdOstanki,
xlab = "Standardizirani ostanki",
ylab = "Frekvenca",
main = "Histrogram standardiziranih ostankov")
shapiro.test(podatki$StdOstanki)
##
## Shapiro-Wilk normality test
##
## data: podatki$StdOstanki
## W = 0.97754, p-value = 0.1213
hist(podatki$CooksD,
xlab = "Cookove razdalje",
ylab = "Frekvenca",
main = "Histrogram Cookovih razdalj")
head(podatki[order(podatki$StdOstanki),], 3) #Prikaz treh enot z najnižjo vrednostjo standardiziranih ostankov.
## Drzava PZD Alkohol Solanje Teza HIV Razvoj StdOstanki CooksD
## 60 Nigeria 54.78 7.83 6.0 8.9 3.11 1 -3.127 0.108
## 44 Kazakhstan 70.57 3.30 11.7 21.0 0.11 1 -2.157 0.043
## 86 Uzbekistan 72.09 1.59 12.0 16.6 0.08 1 -2.021 0.076
head(podatki[order(-podatki$CooksD),], 6) #Prikaz šestih enot z najvišjo vrednostjo Cookovih razdalj.
## Drzava PZD Alkohol Solanje Teza HIV Razvoj StdOstanki CooksD
## 60 Nigeria 54.78 7.83 6.0 8.9 3.11 1 -3.127 0.108
## 56 Namibia 63.38 4.17 6.7 17.2 23.87 1 1.612 0.105
## 52 Micronesia 69.27 1.59 9.7 45.8 0.76 1 -1.792 0.101
## 86 Uzbekistan 72.09 1.59 12.0 16.6 0.08 1 -2.021 0.076
## 79 Swaziland 57.35 7.29 6.8 16.5 29.19 1 0.999 0.071
## 47 Lesotho 52.11 2.81 6.1 16.6 23.80 1 -1.196 0.058
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following objects are masked from 'package:pastecs':
##
## first, last
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
podatki <- podatki %>%
filter(!Drzava == "Nigeria") #Odstranimo Nigerijo
fit1 <- lm(PZD ~ Alkohol + Solanje + Teza + HIV,
data = podatki)
podatki$StdOstanki <- round(rstandard(fit1), 3) #Shranimo standardizirane ostanke
podatki$CooksD <- round(cooks.distance(fit1), 3) #Shranimo Cookove razdalje
hist(podatki$StdOstanki,
xlab = "Standardizirani ostanki",
ylab = "Frekvenca",
main = "Histrogram standardiziranih ostankov")
shapiro.test(podatki$StdOstanki)
##
## Shapiro-Wilk normality test
##
## data: podatki$StdOstanki
## W = 0.97413, p-value = 0.07219
hist(podatki$CooksD,
xlab = "Cookove razdalje",
ylab = "Frekvenca",
main = "Histrogram Cookovih razdalj")
podatki$StdOcenVred <- scale(fit1$fitted.values)
library(car)
scatterplot(y = podatki$StdOstanki, x = podatki$StdOcenVred,
ylab = "Standardizirani ostanki",
xlab = "Standardizirane ocenjene vrednosti",
boxplots = FALSE,
regLine = FALSE,
smooth = FALSE)
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
ols_test_breusch_pagan(fit1)
##
## Breusch Pagan Test for Heteroskedasticity
## -----------------------------------------
## Ho: the variance is constant
## Ha: the variance is not constant
##
## Data
## -------------------------------
## Response : PZD
## Variables: fitted values of PZD
##
## Test Summary
## ----------------------------
## DF = 1
## Chi2 = 0.7854664
## Prob > Chi2 = 0.3754745
summary(fit1) #Rezultati regresijske analize
##
## Call:
## lm(formula = PZD ~ Alkohol + Solanje + Teza + HIV, data = podatki)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.5356 -2.6940 0.6189 2.7663 8.7668
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.850621 1.732141 35.708 < 2e-16 ***
## Alkohol -0.140182 0.132072 -1.061 0.292
## Solanje 1.532527 0.201573 7.603 3.76e-11 ***
## Teza -0.007066 0.054310 -0.130 0.897
## HIV -0.587151 0.076287 -7.697 2.45e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.912 on 84 degrees of freedom
## Multiple R-squared: 0.7169, Adjusted R-squared: 0.7034
## F-statistic: 53.18 on 4 and 84 DF, p-value: < 2.2e-16
sqrt(summary(fit1)$r.squared) #Multipli korelacijski koeficient
## [1] 0.8467003
library(ppcor)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:olsrr':
##
## cement
## The following object is masked from 'package:dplyr':
##
## select
parc_kor <- pcor(podatki[ ,c("PZD", "Alkohol", "Solanje", "Teza", "HIV")]) #Izračun parcialnih korelacijskih koeficientov
round(parc_kor$estimate, 2)
## PZD Alkohol Solanje Teza HIV
## PZD 1.00 -0.12 0.64 -0.01 -0.64
## Alkohol -0.12 1.00 0.48 -0.18 -0.04
## Solanje 0.64 0.48 1.00 0.32 0.25
## Teza -0.01 -0.18 0.32 1.00 -0.06
## HIV -0.64 -0.04 0.25 -0.06 1.00
round(parc_kor$p.value, 3)
## PZD Alkohol Solanje Teza HIV
## PZD 0.000 0.292 0.000 0.897 0.000
## Alkohol 0.292 0.000 0.000 0.093 0.687
## Solanje 0.000 0.000 0.000 0.003 0.018
## Teza 0.897 0.093 0.003 0.000 0.577
## HIV 0.000 0.687 0.018 0.577 0.000
library(lm.beta)
lm.beta(fit1)
##
## Call:
## lm(formula = PZD ~ Alkohol + Solanje + Teza + HIV, data = podatki)
##
## Standardized Coefficients::
## (Intercept) Alkohol Solanje Teza HIV
## NA -0.073127103 0.592684954 -0.008427659 -0.476860987
podatki$RazvojF <- factor(podatki$Razvoj,
levels = c(0, 1),
labels = c("Razvita drzava", "Drzava v razvoju"))
fit2 <- lm(PZD ~ Alkohol + Solanje + Teza + HIV + RazvojF,
data = podatki)
summary(fit2)
##
## Call:
## lm(formula = PZD ~ Alkohol + Solanje + Teza + HIV + RazvojF,
## data = podatki)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.2078 -2.6832 0.8008 2.4541 6.4282
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.10156 2.45033 28.609 < 2e-16 ***
## Alkohol -0.40695 0.13428 -3.031 0.00325 **
## Solanje 1.13724 0.20373 5.582 2.92e-07 ***
## Teza 0.03061 0.04997 0.613 0.54184
## HIV -0.56003 0.06942 -8.067 4.79e-12 ***
## RazvojFDrzava v razvoju -5.25478 1.19810 -4.386 3.37e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.546 on 83 degrees of freedom
## Multiple R-squared: 0.7702, Adjusted R-squared: 0.7563
## F-statistic: 55.63 on 5 and 83 DF, p-value: < 2.2e-16
anova(fit1, fit2) #Primerjava dveh regresijskih funkcij, ki se razlikujeta v številu pojasnjevalnih spremenljivk
## Analysis of Variance Table
##
## Model 1: PZD ~ Alkohol + Solanje + Teza + HIV
## Model 2: PZD ~ Alkohol + Solanje + Teza + HIV + RazvojF
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 84 1285.6
## 2 83 1043.7 1 241.9 19.236 3.366e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
podatki$OcenVred <- fitted.values(fit2) #Ocenjene vrednosti na podlagi funkcije fit2
podatki$Ostanek <- residuals(fit2) #Izračun ostankov na podlagi funkcije fit2
head(podatki[colnames(podatki) %in% c("Drzava", "PZD", "OcenVred", "Ostanek")]) #Prikaz tabele s podatki z izbranimi spremenljivkami
## Drzava PZD OcenVred Ostanek
## 1 Algeria 76.23 74.30322 1.9267770
## 2 Angola 62.22 67.29150 -5.0714988
## 3 Argentina 76.77 73.02991 3.7400912
## 4 Australia 82.58 82.02248 0.5575169
## 5 Austria 81.37 78.72393 2.6460670
## 6 Bahamas 75.57 73.73592 1.8340811