#dataset GaltonFamilies se nalazi u paketu 
library(HistData)

GaltonFamilies[1:10,]
##    family father mother midparentHeight children childNum gender
## 1     001   78.5   67.0           75.43        4        1   male
## 2     001   78.5   67.0           75.43        4        2 female
## 3     001   78.5   67.0           75.43        4        3 female
## 4     001   78.5   67.0           75.43        4        4 female
## 5     002   75.5   66.5           73.66        4        1   male
## 6     002   75.5   66.5           73.66        4        2   male
## 7     002   75.5   66.5           73.66        4        3 female
## 8     002   75.5   66.5           73.66        4        4 female
## 9     003   75.0   64.0           72.06        2        1   male
## 10    003   75.0   64.0           72.06        2        2 female
##    childHeight
## 1         73.2
## 2         69.2
## 3         69.0
## 4         69.0
## 5         73.5
## 6         72.5
## 7         65.5
## 8         65.5
## 9         71.0
## 10        68.0
dim(GaltonFamilies)
## [1] 934   8
#SIMPLE REGRESSION ----
#1. izrada modela
lmmodel=lm(childHeight~midparentHeight,data=GaltonFamilies)
summary (lmmodel)
## 
## Call:
## lm(formula = childHeight ~ midparentHeight, data = GaltonFamilies)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.9570 -2.6989 -0.2155  2.7961 11.6848 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     22.63624    4.26511   5.307 1.39e-07 ***
## midparentHeight  0.63736    0.06161  10.345  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.392 on 932 degrees of freedom
## Multiple R-squared:  0.103,  Adjusted R-squared:  0.102 
## F-statistic:   107 on 1 and 932 DF,  p-value: < 2.2e-16
# the fitted model is: Predicted child height=22.6362+.6374⋅Midparent height.

library(ggplot2)
ggplot(GaltonFamilies, aes(x=midparentHeight, y=childHeight)) + geom_point() + geom_smooth() #linearna pozitivna veza
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

ggplot(GaltonFamilies, aes(x=midparentHeight, y=childHeight)) + geom_point() + geom_smooth(method = "lm", se=F) #linearna pozitivna veza

#this is withoud SE, but we know from the previous slide thad se is small, because we have large sample size and small variability

confint(lmmodel)#interval povjerenja
##                      2.5 %     97.5 %
## (Intercept)     14.2659135 31.0065676
## midparentHeight  0.5164552  0.7582666
lmmodel$coefficients
##     (Intercept) midparentHeight 
##      22.6362405       0.6373609
#2. interpretacija modela 
#interpretacija: to znači da u prosjeku za svaki dodatni centimetar prosjecnih visina roditelja, ocekujemo da ce se visina djeteta povecati u prosjeku za od 0.637, odnosno kretace se u intervalu od 0.516 do 0.758 
# “A unit change in x1 will produce a change of ˆβ1 in the response
##intersept je 22.63, sto znaci kad je prosjek visine roditelja nula, visina djeteta u prosjeku je 22.636incha. Ovo naravno nema smisla

## t stat= coeff/st.error
#kod simple posmatrmo Rsquare a kod multiple adjRsq te p/value

#3 verifikacija modela

#DIAGNOSTIC PLOT----
##**Residuals versus fittet plot----
plot(lmmodel,which=1) #which=1 odnosi se na razlicite vrste plotova, a ima ih 6. jedan oyna;ava residuals vs fitted

#ovjde istrazhujemo homoscedasticity or heteroscedasity
#gledamo linear concept of data. Ako je linija zakrivljena nije linearni model.
#ovaj plot mora biti homeoscedastic.
#na ovom plotu vidimo i outlayere. Izbaci nam broj reda i onda odlucimo da li trebmo da ga izbacimo.
#moramo imati argument za to

lmmodel$coefficients
##     (Intercept) midparentHeight 
##      22.6362405       0.6373609
confint(lmmodel, level=0.99) # ako zelimo da promijenimo nivo pouzdanosti
##                      0.5 %     99.5 %
## (Intercept)     11.6275088 33.6449723
## midparentHeight  0.4783446  0.7963772
##**Cook's distance ----

plot(lmmodel,which=4,id.n = 5) #id.n=5  znači da dobijemo pet rednih brojeva koji su najudaljeniji od fitted value)

##**Other plots---- 

##qq plot
plot(lmmodel,which=2)

##standardized residuals versus fitted values
plot(lmmodel,which=3) 

##MULTIPLE REGRESSION----
#More predictors, x1, x2...xn
lmmodel2=lm(childHeight~mother+father,data=GaltonFamilies)
summary(lmmodel2)
## 
## Call:
## lm(formula = childHeight ~ mother + father, data = GaltonFamilies)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.117 -2.741 -0.218  2.766 11.694 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 22.64328    4.26213   5.313 1.35e-07 ***
## mother       0.29051    0.04852   5.987 3.05e-09 ***
## father       0.36828    0.04489   8.204 7.66e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.389 on 931 degrees of freedom
## Multiple R-squared:  0.1052, Adjusted R-squared:  0.1033 
## F-statistic: 54.74 on 2 and 931 DF,  p-value: < 2.2e-16
#interpretation: “ˆβ1 is the effect of x1 when all the other (specified) predictors are held constant

#fitted model is: Predicted child height=22.6433+.2905*Mother_height+.3683*Father_height.

lmmodel3=lm(childHeight~gender,data=GaltonFamilies)
summary (lmmodel3)
## 
## Call:
## lm(formula = childHeight ~ gender, data = GaltonFamilies)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.234 -1.604 -0.104  1.766  9.766 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  64.1040     0.1173  546.32   <2e-16 ***
## gendermale    5.1301     0.1635   31.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.497 on 932 degrees of freedom
## Multiple R-squared:  0.5137, Adjusted R-squared:  0.5132 
## F-statistic: 984.4 on 1 and 932 DF,  p-value: < 2.2e-16
#fitted model 3 is Height=64.10+5.13⋅Indicator for Male={64.10 if female
                                                       #69.23 if male.

#model parents and gender

#plotting the regression
library(ggplot2)
ggplot(aes(x = midparentHeight, y = childHeight, color = gender), data = GaltonFamilies) + geom_smooth(method = "lm") +
  geom_point()

#graf dokazuje i koeficijente.. nakrivljenost tj slope tj. beta za musko i zensko nisu razliciti da bi bili signifikantni


lmmodel4a <-lm(childHeight ~ midparentHeight+ gender,data=GaltonFamilies)#separatly
summary (lmmodel4a) 
## 
## Call:
## lm(formula = childHeight ~ midparentHeight + gender, data = GaltonFamilies)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5317 -1.4600  0.0979  1.4566  9.1110 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     16.51410    2.73392    6.04 2.22e-09 ***
## midparentHeight  0.68702    0.03944   17.42  < 2e-16 ***
## gendermale       5.21511    0.14216   36.69  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.17 on 931 degrees of freedom
## Multiple R-squared:  0.6332, Adjusted R-squared:  0.6324 
## F-statistic: 803.6 on 2 and 931 DF,  p-value: < 2.2e-16
#INTERACTION TERM
#The interaction effect αβij is interpreted as that part of the mean response not attributable to the additive
#effect of αi and βj. For example, you may enjoy strawberries and cream individually, but the combination
#is superior. In contrast, you may like fish and ice cream separatly but not together

lmmodel4 <- lm(childHeight ~ midparentHeight+ gender + midparentHeight*gender,data=GaltonFamilies) #together

#childHeight = 18.5 + 0.66*midparentHeight + 1.58*gendermale + 0.053*midparentHeight*gendermale
#if there are males = 1 and increase in midparent heigh by 1cm then 0.66 + 1.58*1 +0.053*1*1

0.66+1.58+0.053 #2.293 the increase in the hight by 1cm in midpartentHeight will result to the average increse by 2.293 in sons heights (if we have significance)
## [1] 2.293
#and in case of girls 0

#childheight = 0.066*1 + 1.58*0+0.053*1*0
0.066*1 + 1.58*0+0.053*1*0 #0,06. Increase in the height by 1cma in midparetnt height will result to the average incrase by 0.066 cm in doughter higths 
## [1] 0.066
summary(lmmodel4)
## 
## Call:
## lm(formula = childHeight ~ midparentHeight + gender + midparentHeight * 
##     gender, data = GaltonFamilies)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5431 -1.4568  0.0769  1.4795  9.0860 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                18.33348    3.86636   4.742 2.45e-06 ***
## midparentHeight             0.66075    0.05580  11.842  < 2e-16 ***
## gendermale                  1.57998    5.46264   0.289    0.772    
## midparentHeight:gendermale  0.05252    0.07890   0.666    0.506    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.171 on 930 degrees of freedom
## Multiple R-squared:  0.6334, Adjusted R-squared:  0.6322 
## F-statistic: 535.6 on 3 and 930 DF,  p-value: < 2.2e-16
#its is like fish and ice/cream



#Diagnosis of lmmodel4a

ggplot(GaltonFamilies, aes(y = midparentHeight,x= gender)) + geom_boxplot() #we have some outliers we can decide shall we extract them or not

summary (lmmodel4a) 
## 
## Call:
## lm(formula = childHeight ~ midparentHeight + gender, data = GaltonFamilies)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.5317 -1.4600  0.0979  1.4566  9.1110 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     16.51410    2.73392    6.04 2.22e-09 ***
## midparentHeight  0.68702    0.03944   17.42  < 2e-16 ***
## gendermale       5.21511    0.14216   36.69  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.17 on 931 degrees of freedom
## Multiple R-squared:  0.6332, Adjusted R-squared:  0.6324 
## F-statistic: 803.6 on 2 and 931 DF,  p-value: < 2.2e-16
#the model is y=16.514 + 0.687midparentHeight + 5.2151gendermale
#f statistics: An F statistic is a value you get when you run a regression analysis to find out if the means between two populations are significantly different. 
#It’s similar to a T statistic from a T-Test; A-T test will tell you if a single variable is statistically significant and an F test will tell you if a group of variables are jointly significant.

confint(lmmodel4a)
##                      2.5 %     97.5 %
## (Intercept)     11.1487472 21.8794572
## midparentHeight  0.6096139  0.7644165
## gendermale       4.9361179  5.4940929
#Diagnostics
plot(lmmodel4a, which=1) #they are desperzed around zero, we see the data that is "ruining" our model

#lets do correlation
vars <- c("midparentHeight", "childHeight", "mother", "father")
cor(GaltonFamilies[,vars])
##                 midparentHeight childHeight     mother     father
## midparentHeight       1.0000000   0.3209499 0.72783397 0.72843929
## childHeight           0.3209499   1.0000000 0.20132195 0.26603854
## mother                0.7278340   0.2013219 1.00000000 0.06036612
## father                0.7284393   0.2660385 0.06036612 1.00000000
#we see that there is a big correlation between midparentHeight and mother and father. This means we can not
#use in one model those variables togeter 

#Test the initial correlations
#install.packages("psych")
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
vars <- c("midparentHeight", "childHeight", "mother", "father")
corr.test(GaltonFamilies[,vars]) #It allows for all complete data to be used in the correlations
## Call:corr.test(x = GaltonFamilies[, vars])
## Correlation matrix 
##                 midparentHeight childHeight mother father
## midparentHeight            1.00        0.32   0.73   0.73
## childHeight                0.32        1.00   0.20   0.27
## mother                     0.73        0.20   1.00   0.06
## father                     0.73        0.27   0.06   1.00
## Sample Size 
## [1] 934
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##                 midparentHeight childHeight mother father
## midparentHeight               0           0   0.00   0.00
## childHeight                   0           0   0.00   0.00
## mother                        0           0   0.00   0.07
## father                        0           0   0.07   0.00
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option
#VARIANCE INFLATION FACTOR
library (SparseM) #prvo sparsM pa car
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
library (car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
vif(lmmodel4a) #vif je variance Inflation Factors.  koristimo da objasnimo koliko multicollinearity (correlation between predictors) postoji u regresionoj analizi.
## midparentHeight          gender 
##        1.001179        1.001179
#High VIFs are a sign of multicollinearity.vif je na neki nacin mjera nepreciznosti tako da je npr ako je 
##VIF 4  za estimated koeficijente je  4x veca za koeficijente radi correlation between the two independent variables.

#rule of thumb:
##VIF = 1 (Not correlated)
##1 < VIF < 5 (Moderately correlated)
##VIF >=5 (Highly correlated) 

#sa nasim rezultatom smo ok VIF je oko 1
##DISPLAYING REGRESSION RESULTS
#displaying regression results
library(xtable)
library(texreg)
## Version:  1.36.23
## Date:     2017-03-03
## Author:   Philip Leifeld (University of Glasgow)
## 
## Please cite the JSS article in your publications -- see citation("texreg").
screenreg(list(lmmodel, lmmodel2, lmmodel3, lmmodel4, lmmodel4a))
## 
## ======================================================================================
##                             Model 1     Model 2     Model 3     Model 4     Model 5   
## --------------------------------------------------------------------------------------
## (Intercept)                  22.64 ***   22.64 ***   64.10 ***   18.33 ***   16.51 ***
##                              (4.27)      (4.26)      (0.12)      (3.87)      (2.73)   
## midparentHeight               0.64 ***                            0.66 ***    0.69 ***
##                              (0.06)                              (0.06)      (0.04)   
## mother                                    0.29 ***                                    
##                                          (0.05)                                       
## father                                    0.37 ***                                    
##                                          (0.04)                                       
## gendermale                                            5.13 ***    1.58        5.22 ***
##                                                      (0.16)      (5.46)      (0.14)   
## midparentHeight:gendermale                                        0.05                
##                                                                  (0.08)               
## --------------------------------------------------------------------------------------
## R^2                           0.10        0.11        0.51        0.63        0.63    
## Adj. R^2                      0.10        0.10        0.51        0.63        0.63    
## Num. obs.                   934         934         934         934         934       
## RMSE                          3.39        3.39        2.50        2.17        2.17    
## ======================================================================================
## *** p < 0.001, ** p < 0.01, * p < 0.05