MTRD Seminar 8 - Nočni klub

# Import the semicolon-separated data file (decimal commas, header row)
# and preview its first rows.
podatki <- read.table(
  file = "./Nocni klub.csv",
  sep = ";",
  dec = ",",
  header = TRUE
)
head(podatki)

##   ID Izmenjava Zabavnost Flirtanje Spol
## 1  1         1         0         9    1
## 2  2         0         0         3    0
## 3  3         0         0         4    0
## 4  4         1         1         6    1
## 5  5         1         1         6    1
## 6  6         0         1         6    1

Opis spremenljivk:

Izmenjava - Ali je oseba izmenjala telefonsko številko (0:NE, 1:DA).
Zabavnost - 0: povsem dolgočasen pogovor, 10: zelo zabaven pogovor.
Flirtanje - 0: povsem brez flirtanja, 10: zelo očitno flirtanje.
Spol - Spol ogovorjene osebe (0:Z, 1:M).

# Labelled factor versions of the two 0/1 coded variables.
# Izmenjava: 0 -> "NE", 1 -> "DA"
podatki[["IzmenjavaF"]] <- factor(
  podatki[["Izmenjava"]],
  levels = 0:1,
  labels = c("NE", "DA")
)

# Spol: 0 -> "Z", 1 -> "M"
podatki[["SpolF"]] <- factor(
  podatki[["Spol"]],
  levels = 0:1,
  labels = c("Z", "M")
)
head(podatki)

##   ID Izmenjava Zabavnost Flirtanje Spol IzmenjavaF SpolF
## 1  1         1         0         9    1         DA     M
## 2  2         0         0         3    0         NE     Z
## 3  3         0         0         4    0         NE     Z
## 4  4         1         1         6    1         DA     M
## 5  5         1         1         6    1         DA     M
## 6  6         0         1         6    1         NE     M

# Descriptive statistics for the analysis variables; intersect() keeps the
# columns in the order in which they appear in the data frame.
summary(podatki[intersect(
  colnames(podatki),
  c("IzmenjavaF", "Zabavnost", "Flirtanje", "SpolF")
)])

##    Zabavnost        Flirtanje      IzmenjavaF SpolF  
##  Min.   : 0.000   Min.   : 0.000   NE:400     Z:672  
##  1st Qu.: 3.000   1st Qu.: 6.000   DA:620     M:348  
##  Median : 4.000   Median : 6.000                     
##  Mean   : 4.372   Mean   : 6.034                     
##  3rd Qu.: 5.000   3rd Qu.: 7.000                     
##  Max.   :10.000   Max.   :10.000

# Intercept-only (null) logistic regression for the exchange indicator.
fit0 <- glm(formula = IzmenjavaF ~ 1, data = podatki, family = binomial)

summary(object = fit0)

## 
## Call:
## glm(formula = IzmenjavaF ~ 1, family = binomial, data = podatki)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.43825    0.06413   6.834 8.28e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1366.2  on 1019  degrees of freedom
## Residual deviance: 1366.2  on 1019  degrees of freedom
## AIC: 1368.2
## 
## Number of Fisher Scoring iterations: 4

# First few fitted probabilities from the null model.
head(fitted.values(fit0))

##         1         2         3         4         5         6 
## 0.6078431 0.6078431 0.6078431 0.6078431 0.6078431 0.6078431

# Main-effects logistic regression: Zabavnost, Flirtanje and SpolF as
# predictors of IzmenjavaF.
fit1 <- glm(
  formula = IzmenjavaF ~ Zabavnost + Flirtanje + SpolF,
  data = podatki,
  family = binomial
)

# car supplies vif(), which is called on fit1 further below.
library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:DescTools':
## 
##     Recode

# Variance inflation factors for the predictors of fit1.
car::vif(fit1)

## Zabavnost Flirtanje     SpolF 
##  1.572250  1.005366  1.566163

# Store the per-case diagnostics of fit1 next to the data so they can be
# plotted and sorted below.
podatki[["StdOstanki"]] <- rstandard(fit1)
podatki[["CooksD"]] <- cooks.distance(fit1)

# Histograms of the two case diagnostics, shown side by side.
library(ggplot2)

StdOst <- ggplot(data = podatki, mapping = aes(x = StdOstanki)) +
  geom_histogram() +
  labs(x = "Standardizirani ostanki") +
  theme_linedraw()

Cook <- ggplot(data = podatki, mapping = aes(x = CooksD)) +
  geom_histogram() +
  labs(x = "Cookove razdalje") +
  theme_linedraw()

# NOTE(review): no library() call for ggarrange() is visible in this file;
# presumably the providing package is attached earlier -- confirm.
ggarrange(StdOst, Cook, nrow = 1, ncol = 2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Five cases with the smallest (most negative) standardized residuals.
podatki[head(order(podatki$StdOstanki), n = 5), c("ID", "StdOstanki")]

##        ID StdOstanki
## 1011 1011  -3.095444
## 961   961  -2.688333
## 960   960  -2.675954
## 833   833  -2.457668
## 520   520  -2.239093

# Five cases with the largest standardized residuals.
podatki[head(order(-podatki$StdOstanki), n = 5), c("ID", "StdOstanki")]

##      ID StdOstanki
## 95   95   1.885151
## 233 233   1.654892
## 279 279   1.654892
## 238 238   1.638542
## 242 242   1.638542

# Five cases with the largest Cook's distances.
podatki[head(order(-podatki$CooksD), n = 5), c("ID", "CooksD")]

##        ID     CooksD
## 1011 1011 0.04390995
## 961   961 0.02116680
## 960   960 0.02086403
## 833   833 0.01318403
## 543   543 0.01184157

library(dplyr)

# Remove the cases with IDs 1011, 961 and 960 from the data set.
podatki <- filter(podatki, !(ID %in% c(1011, 961, 960)))

# Refit the null and the main-effects model on the reduced data.
fit0 <- glm(formula = IzmenjavaF ~ 1, data = podatki, family = binomial)

fit1 <- glm(
  formula = IzmenjavaF ~ Zabavnost + Flirtanje + SpolF,
  data = podatki,
  family = binomial
)

# Likelihood-ratio (chi-square) comparison of the two nested models.
anova(fit0, fit1, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: IzmenjavaF ~ 1
## Model 2: IzmenjavaF ~ Zabavnost + Flirtanje + SpolF
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      1016     1360.6                          
## 2      1013     1161.7  3   198.88 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Coefficient table and deviance summary of the refitted main-effects model.
summary(object = fit1)

## 
## Call:
## glm(formula = IzmenjavaF ~ Zabavnost + Flirtanje + SpolF, family = binomial, 
##     data = podatki)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.26890    0.42711  -7.654 1.96e-14 ***
## Zabavnost    0.64266    0.06756   9.512  < 2e-16 ***
## Flirtanje    0.03897    0.04725   0.825    0.409    
## SpolFM       2.38519    0.20720  11.511  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1360.6  on 1016  degrees of freedom
## Residual deviance: 1161.7  on 1013  degrees of freedom
## AIC: 1169.7
## 
## Number of Fisher Scoring iterations: 4

# Exponentiated coefficients of fit1 (column RO) together with the
# exponentiated limits returned by confint.default().
exp(cbind(RO = coef(fit1), confint.default(fit1)))

##                      RO      2.5 %      97.5 %
## (Intercept)  0.03804816 0.01647315  0.08788016
## Zabavnost    1.90154076 1.66569156  2.17078441
## Flirtanje    1.03973908 0.94777891  1.14062188
## SpolFM      10.86109909 7.23603715 16.30222053

# Extend fit1 with the interactions of SpolF with Zabavnost and Flirtanje.
fit2 <- glm(
  formula = IzmenjavaF ~ Zabavnost + Flirtanje + SpolF + Zabavnost:SpolF + Flirtanje:SpolF,
  data = podatki,
  family = binomial
)

# Sequential likelihood-ratio (chi-square) tests: null -> main effects ->
# main effects plus interactions.
anova(fit0, fit1, fit2, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: IzmenjavaF ~ 1
## Model 2: IzmenjavaF ~ Zabavnost + Flirtanje + SpolF
## Model 3: IzmenjavaF ~ Zabavnost + Flirtanje + SpolF + Zabavnost:SpolF + 
##     Flirtanje:SpolF
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      1016     1360.6                          
## 2      1013     1161.7  3  198.882 < 2.2e-16 ***
## 3      1011     1142.2  2   19.456  5.96e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Coefficient table and deviance summary of the interaction model.
summary(object = fit2)

## 
## Call:
## glm(formula = IzmenjavaF ~ Zabavnost + Flirtanje + SpolF + Zabavnost:SpolF + 
##     Flirtanje:SpolF, family = binomial, data = podatki)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -3.19086    0.51029  -6.253 4.03e-10 ***
## Zabavnost         0.75407    0.08341   9.040  < 2e-16 ***
## Flirtanje        -0.06278    0.05561  -1.129 0.258966    
## SpolFM            1.52604    0.79894   1.910 0.056123 .  
## Zabavnost:SpolFM -0.39882    0.14605  -2.731 0.006321 ** 
## Flirtanje:SpolFM  0.38644    0.10452   3.697 0.000218 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1360.6  on 1016  degrees of freedom
## Residual deviance: 1142.2  on 1011  degrees of freedom
## AIC: 1154.2
## 
## Number of Fisher Scoring iterations: 4

# Exponentiated coefficients of fit2 (column RO) together with the
# exponentiated limits returned by confint.default().
exp(cbind(RO = coef(fit2), confint.default(fit2)))

##                          RO      2.5 %     97.5 %
## (Intercept)      0.04113665 0.01513094  0.1118387
## Zabavnost        2.12562825 1.80503220  2.5031661
## Flirtanje        0.93914973 0.84216129  1.0473079
## SpolFM           4.59994116 0.96092821 22.0198123
## Zabavnost:SpolFM 0.67111186 0.50405006  0.8935444
## Flirtanje:SpolFM 1.47172913 1.19910538  1.8063355

# Fitted probabilities from the interaction model.
podatki[["OceneVerjet"]] <- fitted(fit2)

# Classify a case as 1 when its fitted probability exceeds 0.5, else 0.
podatki[["Uvrstitev"]] <- ifelse(podatki[["OceneVerjet"]] > 0.5, yes = 1, no = 0)

# Labelled version of the predicted class (0 -> "NE", 1 -> "DA").
podatki[["UvrstitevF"]] <- factor(
  podatki[["Uvrstitev"]],
  levels = 0:1,
  labels = c("NE", "DA")
)

# Observed outcome, fitted probability and predicted class, first 10 rows.
head(podatki[, c("IzmenjavaF", "OceneVerjet", "UvrstitevF")], n = 10)

##    IzmenjavaF OceneVerjet UvrstitevF
## 1          DA  0.77697191         DA
## 2          NE  0.03295198         NE
## 3          NE  0.03100902         NE
## 4          DA  0.65302927         DA
## 5          DA  0.65302927         DA
## 6          NE  0.65302927         DA
## 7          NE  0.41615169         NE
## 8          DA  0.57657375         DA
## 9          NE  0.72232808         DA
## 10         NE  0.49626663         NE

# Classification table: observed outcome in rows, predicted class in columns.
razvrst_tabela <- table(podatki[["IzmenjavaF"]], podatki[["UvrstitevF"]])
razvrst_tabela

##     
##       NE  DA
##   NE 171 226
##   DA  81 539

# Share of correctly classified cases for fit2: the diagonal of the
# classification table holds the cases whose predicted class equals the
# observed one.
# NOTE: despite the object name, this is the classification accuracy and
# not a pseudo R^2; the name is kept so code that refers to it still works.
# The denominator is the table total rather than nrow(podatki), so the
# ratio stays correct even if table() dropped rows with missing values.
Psevdo_R2_fit2 <- sum(diag(razvrst_tabela)) / sum(razvrst_tabela)
Psevdo_R2_fit2

## [1] 0.6981318