#dataset GaltonFamilies se nalazi u paketu
library(HistData)
GaltonFamilies[1:10,]
## family father mother midparentHeight children childNum gender
## 1 001 78.5 67.0 75.43 4 1 male
## 2 001 78.5 67.0 75.43 4 2 female
## 3 001 78.5 67.0 75.43 4 3 female
## 4 001 78.5 67.0 75.43 4 4 female
## 5 002 75.5 66.5 73.66 4 1 male
## 6 002 75.5 66.5 73.66 4 2 male
## 7 002 75.5 66.5 73.66 4 3 female
## 8 002 75.5 66.5 73.66 4 4 female
## 9 003 75.0 64.0 72.06 2 1 male
## 10 003 75.0 64.0 72.06 2 2 female
## childHeight
## 1 73.2
## 2 69.2
## 3 69.0
## 4 69.0
## 5 73.5
## 6 72.5
## 7 65.5
## 8 65.5
## 9 71.0
## 10 68.0
dim(GaltonFamilies)
## [1] 934 8
#SIMPLE REGRESSION ----
#1. izrada modela
lmmodel=lm(childHeight~midparentHeight,data=GaltonFamilies)
summary (lmmodel)
##
## Call:
## lm(formula = childHeight ~ midparentHeight, data = GaltonFamilies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9570 -2.6989 -0.2155 2.7961 11.6848
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.63624 4.26511 5.307 1.39e-07 ***
## midparentHeight 0.63736 0.06161 10.345 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.392 on 932 degrees of freedom
## Multiple R-squared: 0.103, Adjusted R-squared: 0.102
## F-statistic: 107 on 1 and 932 DF, p-value: < 2.2e-16
# the fitted model is: Predicted child height=22.6362+.6374⋅Midparent height.
library(ggplot2)
ggplot(GaltonFamilies, aes(x=midparentHeight, y=childHeight)) + geom_point() + geom_smooth() #linearna pozitivna veza
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(GaltonFamilies, aes(x=midparentHeight, y=childHeight)) + geom_point() + geom_smooth(method = "lm", se=F) #linearna pozitivna veza

#this is withoud SE, but we know from the previous slide thad se is small, because we have large sample size and small variability
confint(lmmodel)#interval povjerenja
## 2.5 % 97.5 %
## (Intercept) 14.2659135 31.0065676
## midparentHeight 0.5164552 0.7582666
lmmodel$coefficients
## (Intercept) midparentHeight
## 22.6362405 0.6373609
#2. interpretacija modela
#interpretacija: to znači da u prosjeku za svaki dodatni centimetar prosjecnih visina roditelja, ocekujemo da ce se visina djeteta povecati u prosjeku za od 0.637, odnosno kretace se u intervalu od 0.516 do 0.758
# “A unit change in x1 will produce a change of ˆβ1 in the response
##intersept je 22.63, sto znaci kad je prosjek visine roditelja nula, visina djeteta u prosjeku je 22.636incha. Ovo naravno nema smisla
## t stat= coeff/st.error
#kod simple posmatrmo Rsquare a kod multiple adjRsq te p/value
#3 verifikacija modela
#DIAGNOSTIC PLOT----
##**Residuals versus fittet plot----
plot(lmmodel,which=1) #which=1 odnosi se na razlicite vrste plotova, a ima ih 6. jedan oyna;ava residuals vs fitted

#ovjde istrazhujemo homoscedasticity or heteroscedasity
#gledamo linear concept of data. Ako je linija zakrivljena nije linearni model.
#ovaj plot mora biti homeoscedastic.
#na ovom plotu vidimo i outlayere. Izbaci nam broj reda i onda odlucimo da li trebmo da ga izbacimo.
#moramo imati argument za to
lmmodel$coefficients
## (Intercept) midparentHeight
## 22.6362405 0.6373609
confint(lmmodel, level=0.99) # ako zelimo da promijenimo nivo pouzdanosti
## 0.5 % 99.5 %
## (Intercept) 11.6275088 33.6449723
## midparentHeight 0.4783446 0.7963772
##**Cook's distance ----
plot(lmmodel,which=4,id.n = 5) #id.n=5 znači da dobijemo pet rednih brojeva koji su najudaljeniji od fitted value)

##**Other plots----
##qq plot
plot(lmmodel,which=2)

##standardized residuals versus fitted values
plot(lmmodel,which=3)

##MULTIPLE REGRESSION----
#More predictors, x1, x2...xn
lmmodel2=lm(childHeight~mother+father,data=GaltonFamilies)
summary(lmmodel2)
##
## Call:
## lm(formula = childHeight ~ mother + father, data = GaltonFamilies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.117 -2.741 -0.218 2.766 11.694
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.64328 4.26213 5.313 1.35e-07 ***
## mother 0.29051 0.04852 5.987 3.05e-09 ***
## father 0.36828 0.04489 8.204 7.66e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.389 on 931 degrees of freedom
## Multiple R-squared: 0.1052, Adjusted R-squared: 0.1033
## F-statistic: 54.74 on 2 and 931 DF, p-value: < 2.2e-16
#interpretation: “ˆβ1 is the effect of x1 when all the other (specified) predictors are held constant
#fitted model is: Predicted child height=22.6433+.2905*Mother_height+.3683*Father_height.
lmmodel3=lm(childHeight~gender,data=GaltonFamilies)
summary (lmmodel3)
##
## Call:
## lm(formula = childHeight ~ gender, data = GaltonFamilies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.234 -1.604 -0.104 1.766 9.766
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64.1040 0.1173 546.32 <2e-16 ***
## gendermale 5.1301 0.1635 31.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.497 on 932 degrees of freedom
## Multiple R-squared: 0.5137, Adjusted R-squared: 0.5132
## F-statistic: 984.4 on 1 and 932 DF, p-value: < 2.2e-16
#fitted model 3 is Height=64.10+5.13⋅Indicator for Male={64.10 if female
#69.23 if male.
#model parents and gender
#plotting the regression
library(ggplot2)
ggplot(aes(x = midparentHeight, y = childHeight, color = gender), data = GaltonFamilies) + geom_smooth(method = "lm") +
geom_point()

#graf dokazuje i koeficijente.. nakrivljenost tj slope tj. beta za musko i zensko nisu razliciti da bi bili signifikantni
lmmodel4a <-lm(childHeight ~ midparentHeight+ gender,data=GaltonFamilies)#separatly
summary (lmmodel4a)
##
## Call:
## lm(formula = childHeight ~ midparentHeight + gender, data = GaltonFamilies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5317 -1.4600 0.0979 1.4566 9.1110
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.51410 2.73392 6.04 2.22e-09 ***
## midparentHeight 0.68702 0.03944 17.42 < 2e-16 ***
## gendermale 5.21511 0.14216 36.69 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.17 on 931 degrees of freedom
## Multiple R-squared: 0.6332, Adjusted R-squared: 0.6324
## F-statistic: 803.6 on 2 and 931 DF, p-value: < 2.2e-16
#INTERACTION TERM
#The interaction effect αβij is interpreted as that part of the mean response not attributable to the additive
#effect of αi and βj. For example, you may enjoy strawberries and cream individually, but the combination
#is superior. In contrast, you may like fish and ice cream separatly but not together
lmmodel4 <- lm(childHeight ~ midparentHeight+ gender + midparentHeight*gender,data=GaltonFamilies) #together
#childHeight = 18.5 + 0.66*midparentHeight + 1.58*gendermale + 0.053*midparentHeight*gendermale
#if there are males = 1 and increase in midparent heigh by 1cm then 0.66 + 1.58*1 +0.053*1*1
0.66+1.58+0.053 #2.293 the increase in the hight by 1cm in midpartentHeight will result to the average increse by 2.293 in sons heights (if we have significance)
## [1] 2.293
#and in case of girls 0
#childheight = 0.066*1 + 1.58*0+0.053*1*0
0.066*1 + 1.58*0+0.053*1*0 #0,06. Increase in the height by 1cma in midparetnt height will result to the average incrase by 0.066 cm in doughter higths
## [1] 0.066
summary(lmmodel4)
##
## Call:
## lm(formula = childHeight ~ midparentHeight + gender + midparentHeight *
## gender, data = GaltonFamilies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5431 -1.4568 0.0769 1.4795 9.0860
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.33348 3.86636 4.742 2.45e-06 ***
## midparentHeight 0.66075 0.05580 11.842 < 2e-16 ***
## gendermale 1.57998 5.46264 0.289 0.772
## midparentHeight:gendermale 0.05252 0.07890 0.666 0.506
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.171 on 930 degrees of freedom
## Multiple R-squared: 0.6334, Adjusted R-squared: 0.6322
## F-statistic: 535.6 on 3 and 930 DF, p-value: < 2.2e-16
#its is like fish and ice/cream
#Diagnosis of lmmodel4a
ggplot(GaltonFamilies, aes(y = midparentHeight,x= gender)) + geom_boxplot() #we have some outliers we can decide shall we extract them or not

summary (lmmodel4a)
##
## Call:
## lm(formula = childHeight ~ midparentHeight + gender, data = GaltonFamilies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5317 -1.4600 0.0979 1.4566 9.1110
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.51410 2.73392 6.04 2.22e-09 ***
## midparentHeight 0.68702 0.03944 17.42 < 2e-16 ***
## gendermale 5.21511 0.14216 36.69 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.17 on 931 degrees of freedom
## Multiple R-squared: 0.6332, Adjusted R-squared: 0.6324
## F-statistic: 803.6 on 2 and 931 DF, p-value: < 2.2e-16
#the model is y=16.514 + 0.687midparentHeight + 5.2151gendermale
#f statistics: An F statistic is a value you get when you run a regression analysis to find out if the means between two populations are significantly different.
#It’s similar to a T statistic from a T-Test; A-T test will tell you if a single variable is statistically significant and an F test will tell you if a group of variables are jointly significant.
confint(lmmodel4a)
## 2.5 % 97.5 %
## (Intercept) 11.1487472 21.8794572
## midparentHeight 0.6096139 0.7644165
## gendermale 4.9361179 5.4940929
#Diagnostics
plot(lmmodel4a, which=1) #they are desperzed around zero, we see the data that is "ruining" our model

#lets do correlation
vars <- c("midparentHeight", "childHeight", "mother", "father")
cor(GaltonFamilies[,vars])
## midparentHeight childHeight mother father
## midparentHeight 1.0000000 0.3209499 0.72783397 0.72843929
## childHeight 0.3209499 1.0000000 0.20132195 0.26603854
## mother 0.7278340 0.2013219 1.00000000 0.06036612
## father 0.7284393 0.2660385 0.06036612 1.00000000
#we see that there is a big correlation between midparentHeight and mother and father. This means we can not
#use in one model those variables togeter
#Test the initial correlations
#install.packages("psych")
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
vars <- c("midparentHeight", "childHeight", "mother", "father")
corr.test(GaltonFamilies[,vars]) #It allows for all complete data to be used in the correlations
## Call:corr.test(x = GaltonFamilies[, vars])
## Correlation matrix
## midparentHeight childHeight mother father
## midparentHeight 1.00 0.32 0.73 0.73
## childHeight 0.32 1.00 0.20 0.27
## mother 0.73 0.20 1.00 0.06
## father 0.73 0.27 0.06 1.00
## Sample Size
## [1] 934
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## midparentHeight childHeight mother father
## midparentHeight 0 0 0.00 0.00
## childHeight 0 0 0.00 0.00
## mother 0 0 0.00 0.07
## father 0 0 0.07 0.00
##
## To see confidence intervals of the correlations, print with the short=FALSE option
#VARIANCE INFLATION FACTOR
library (SparseM) #prvo sparsM pa car
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library (car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
vif(lmmodel4a) #vif je variance Inflation Factors. koristimo da objasnimo koliko multicollinearity (correlation between predictors) postoji u regresionoj analizi.
## midparentHeight gender
## 1.001179 1.001179
#High VIFs are a sign of multicollinearity.vif je na neki nacin mjera nepreciznosti tako da je npr ako je
##VIF 4 za estimated koeficijente je 4x veca za koeficijente radi correlation between the two independent variables.
#rule of thumb:
##VIF = 1 (Not correlated)
##1 < VIF < 5 (Moderately correlated)
##VIF >=5 (Highly correlated)
#sa nasim rezultatom smo ok VIF je oko 1
##DISPLAYING REGRESSION RESULTS
#displaying regression results
library(xtable)
library(texreg)
## Version: 1.36.23
## Date: 2017-03-03
## Author: Philip Leifeld (University of Glasgow)
##
## Please cite the JSS article in your publications -- see citation("texreg").
screenreg(list(lmmodel, lmmodel2, lmmodel3, lmmodel4, lmmodel4a))
##
## ======================================================================================
## Model 1 Model 2 Model 3 Model 4 Model 5
## --------------------------------------------------------------------------------------
## (Intercept) 22.64 *** 22.64 *** 64.10 *** 18.33 *** 16.51 ***
## (4.27) (4.26) (0.12) (3.87) (2.73)
## midparentHeight 0.64 *** 0.66 *** 0.69 ***
## (0.06) (0.06) (0.04)
## mother 0.29 ***
## (0.05)
## father 0.37 ***
## (0.04)
## gendermale 5.13 *** 1.58 5.22 ***
## (0.16) (5.46) (0.14)
## midparentHeight:gendermale 0.05
## (0.08)
## --------------------------------------------------------------------------------------
## R^2 0.10 0.11 0.51 0.63 0.63
## Adj. R^2 0.10 0.10 0.51 0.63 0.63
## Num. obs. 934 934 934 934 934
## RMSE 3.39 3.39 2.50 2.17 2.17
## ======================================================================================
## *** p < 0.001, ** p < 0.01, * p < 0.05