library(haven)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0     v readr   1.2.1
## v tibble  2.1.3     v purrr   0.2.5
## v tidyr   0.8.2     v stringr 1.3.1
## v ggplot2 3.1.0     v forcats 0.3.0
## -- Conflicts ------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(readxl)
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2019 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(data.table)
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
## 
##     transpose
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(psycho)
library(ggplot2)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(DescTools)
## 
## Attaching package: 'DescTools'
## The following object is masked from 'package:data.table':
## 
##     %like%
library(ggpubr)
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:DescTools':
## 
##     Recode
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:dplyr':
## 
##     recode
library(tibble)
library(nnet)

disleksija <- read_sav("D:/Vita/Disleksija/Baze/disleksija_final.sav")
disleksija$Disleksija <- as.factor(disleksija$Disleksija)
disleksija$Pol <- as.factor(disleksija$Pol)
disleksija$UzrasnaGrupa <- as.factor(disleksija$UzrasnaGrupa)

#creating training and test subset
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE
## The following object is masked from 'package:purrr':
## 
##     lift
library(caTools)
set.seed(101)

split = sample.split(disleksija$Disleksija, SplitRatio = 0.70)


final.train = subset(disleksija, split == TRUE)
final.test = subset(disleksija, split == FALSE)


#logit model with all predictors with 3 fold cross validation

fitControl <- trainControl(## 10-fold CV
  method = "cv",
  number = 3,
  savePredictions = TRUE
)


#MODEL 1 - sve i uzrast i konacni skor i svi subtestovi

model1<-caret::train(Disleksija ~ Uzrast + 
                       Brzo_imenovanje + 
                       Nizanje_kuglica +
                       Jednominutno_citanje + 
                       Posturalna_stabilnost + 
                       Fonemska_segmentacija + 
                       Dvominutni_speling +
                       Brojcani_niz_unazad + 
                       Citanje_besmislenog_teksta + 
                       Jednominutno_pisanje +
                       Verbalna_fluentnost +
                       Semanticka_fluentnost +
                       Vokabular +
                       ARQbroj,data=disleksija,method="glm",family=binomial(),
                     trControl=fitControl)

pred <- predict(model1, newdata = final.test, type = "raw")
confusionMatrix(pred, final.test$Disleksija)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 180   0
##          1   0   9
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9807, 1)
##     No Information Rate : 0.9524     
##     P-Value [Acc > NIR] : 9.891e-05  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.9524     
##          Detection Rate : 0.9524     
##    Detection Prevalence : 0.9524     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 0          
## 
#Plotovanje modela za razlicite grupe ispitanika


model <- glm(Disleksija ~ ARQbroj + 
               UzrasnaGrupa,data=final.train, na.action="na.exclude",  family="binomial") 

model$coef
##   (Intercept)       ARQbroj UzrasnaGrupa2 UzrasnaGrupa3 UzrasnaGrupa4 
##  -27.99556321    3.31102794   18.62343348   -3.64857213    0.21736686 
## UzrasnaGrupa5 
##    0.03972801
contrasts(final.test$UzrasnaGrupa)
##   2 3 4 5
## 1 0 0 0 0
## 2 1 0 0 0
## 3 0 1 0 0
## 4 0 0 1 0
## 5 0 0 0 1
b0 <- model$coef[1] # intercept
X1 <- model$coef[2]
uzrasnagrupa2 <- model$coef[3]
uzrasnagrupa3 <- model$coef[4]
uzrasnagrupa4 <- model$coef[5]
uzrasnagrupa5 <- model$coef[6]


X1_range <- seq(from=min(final.train$ARQbroj), to=(ARQbroj=50), by=.01)

referentna <- b0 + 
  X1*X1_range +
  uzrasnagrupa2*0 + 
  uzrasnagrupa3*0 + 
  uzrasnagrupa4*0 + 
  uzrasnagrupa5*0  

uzrast2_log <- b0 + 
  X1*X1_range +
  uzrasnagrupa2*1 + 
  uzrasnagrupa3*0 + 
  uzrasnagrupa4*0 + 
  uzrasnagrupa5*0 

uzrast3_log <- b0 + 
  X1*X1_range +
  uzrasnagrupa2*0 + 
  uzrasnagrupa3*1 + 
  uzrasnagrupa4*0 + 
  uzrasnagrupa5*0 

uzrast4_log <- b0 + 
  X1*X1_range +
  uzrasnagrupa2*0 + 
  uzrasnagrupa3*0 + 
  uzrasnagrupa4*1 + 
  uzrasnagrupa5*0   
  
uzrast5_log <- b0 + 
  X1*X1_range +
  uzrasnagrupa2*0 + 
  uzrasnagrupa3*0 + 
  uzrasnagrupa4*0 + 
  uzrasnagrupa5*1  
# Compute the probibilities (this is what will actually get plotted):

referentna_prob <- exp(referentna)/(1 + exp(referentna))
uzrast2_prob <- exp(uzrast2_log)/(1 + exp(uzrast2_log))
uzrast3_prob <- exp(uzrast3_log)/(1 + exp(uzrast3_log))
uzrast4_prob <- exp(uzrast4_log)/(1 + exp(uzrast4_log))
uzrast5_prob <- exp(uzrast5_log)/(1 + exp(uzrast5_log))

# We'll start by plotting the ref group:
plot(X1_range, referentna_prob, 
     ylim=c(0,1),
     type="l", 
     lwd=3, 
     lty=2, 
     col="gold", 
     xlab="Postignuce na DST-J", ylab="P(ishod)", main="Verovatnoca postojanja disleksije")




# Add the line for people who are in the b group
lines(X1_range, uzrast2_prob, 
      type="l", 
      lwd=3, 
      lty=3, 
      col="green")

# Add the line for people who are in the c group
lines(X1_range, uzrast3_prob, 
      type="l", 
      lwd=3, 
      lty=4, 
      col="blue")

lines(X1_range, uzrast4_prob, 
      type="l", 
      lwd=3, 
      lty=4, 
      col="orangered")

lines(X1_range, uzrast5_prob, 
      type="l", 
      lwd=3, 
      lty=4, 
      col="red")

# add a horizontal line at p=.5
abline(h=.5, lty=2)

legend("bottomright", legend = c("Od 6,6 do 7,5 godina", 
                        "Od 7,6 do 8,5 godina", 
                        "Od 8,6 do 9,5 godina", 
                        "Od 9,6 do 10,5 godina", 
                        "Od 10,6 do 11,5 godina"), col=c("gold", "green", "blue", "orangered", "red"),lty=1:2, cex=0.8)

#With ggplot

library(ggplot2)
library(tidyr)
# first you have to get the information into a long dataframe, which is what ggplot likes :)
plot.data <- data.frame(uzrast1=referentna_prob, uzrast2=uzrast2_prob, uzrast3=uzrast3_prob, uzrast4=uzrast4_prob, uzrast5=uzrast5_prob, X1=X1_range)
plot.data <- gather(plot.data, key=group, value=uzrast, 1:5)
head(plot.data)
##     X1   group       uzrast
## 1 0.00 uzrast1 6.945146e-13
## 2 0.01 uzrast1 7.178951e-13
## 3 0.02 uzrast1 7.420627e-13
## 4 0.03 uzrast1 7.670439e-13
## 5 0.04 uzrast1 7.928661e-13
## 6 0.05 uzrast1 8.195575e-13
ggplot(plot.data, aes(x=X1, y=uzrast, color=group)) + # asking it to set the color by the variable "group" is what makes it draw three different lines
  geom_line(lwd=2) + 
  labs(x="Postignuce na DST-J", y="P(ishod)", title="Verovatnoca za disleksiju u zavisnosti od skora na DST-J")