In-class exercises 1

Consider the outcome of a brightness discrimination experiment. On each trial, two patches of light were presented: one was a standatd with an intensity of 35 units, and another was a variable intensity comparison light. The subject was asked to indicate which one of the lights was brighter.

Source: Gegenfurtner, K. (1992). Praxis: Brent’s algorithm for function minimization. Behavior Research Methods, Instruments, and Computers. 24(4), 560-564.

  • Column 1: Stimulus condition
  • Column 2: Intensity of the variable (comparison) light
  • Column 3: Number of comparisons
  • Column 4: Number of times comparison light was judged brighter

Data preparation

pacman::p_load(tidyverse, ggplot2)
# input data
dta <- read.table("brightness.txt", h=T)

head(dta)
##   Stimulus Intensity Trial Yes
## 1        1        10    49   2
## 2        2        20    48   3
## 3        3        30    48  13
## 4        4        40    50  31
## 5        5        50    47  38
## 6        6        60    49  48
# observed proportions -Yes的人在trial裡佔的比例
dta$Prop <-  dta$Yes/dta$Trial

Use glm to fit a psychophysical function

Probit model的係數可以用於計算Z分,計算累積機率密度

summary(m0 <- glm(cbind(Yes, Trial - Yes) ~ Intensity, data = dta, 
                  family = binomial(link = "probit")))
## 
## Call:
## glm(formula = cbind(Yes, Trial - Yes) ~ Intensity, family = binomial(link = "probit"), 
##     data = dta)
## 
## Deviance Residuals: 
##       1        2        3        4        5        6  
##  1.0388  -0.7613  -0.3124   0.4486  -0.5979   0.7050  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.874139   0.286787  -10.02   <2e-16 ***
## Intensity    0.077472   0.007367   10.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 184.5899  on 5  degrees of freedom
## Residual deviance:   2.8119  on 4  degrees of freedom
## AIC: 26.549
## 
## Number of Fisher Scoring iterations: 4

-For a one unit increase in Intensity, the z-score increases by 0.07

# new values in the intensity range 
newxval <- with(dta, data.frame(Intensity = seq(5,70, by = 0.5)))

# predicted probabilities
phat <- predict(m0, newdata = newxval, type = "response" )
#
ot <- theme_set(theme_bw())
# fitted psychophysical curve
ggplot() +
 geom_line(aes(x=newxval[,1], y = phat)) +
 geom_point(aes(x = dta$Intensity, y = dta$Prop)) +
 geom_vline(xintercept = 35, lty = 2, col = "gray") +
 geom_hline(yintercept = .5, lty = 2, col = "gray") +
 labs(x = "Intensity", y = "Probability of correnct responses")

In class Exercise 2

Silvapulle (1981) reported a study on the relation between psychiatric diagnosis and the score on a 12-item General Health Questionnaire (GHQ) for 120 patients admitted to a hospital. Download the ghq.rda data file to your “data” folder and load it to your R session with the command

  • Column 1: Gender ID
  • Column 2: GHQ score
  • Column 3: Number of cases
  • Column 4: Number of non-cases

The patient was classified by psychiatrists as either a ‘case’ or a ‘non-case’. The question of interest was whether GHQ score, together with patient’s gender, could be used to detect psychiatric cases. Perform an appropriate analysis.

load("ghq.rda")
str(ghq)
## 'data.frame':    22 obs. of  4 variables:
##  $ sex: Factor w/ 2 levels "men","women": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ghq: int  0 1 2 3 4 5 6 7 8 9 ...
##  $ c  : int  0 0 1 0 1 3 0 2 0 0 ...
##  $ nc : int  18 8 1 0 0 0 0 0 0 0 ...
dta_2 <-  ghq %>% mutate(Total = c + nc) %>%
  dplyr::rename(Gender=sex, Score=ghq, Case=c, Noncase=nc)
head(dta_2)
##   Gender Score Case Noncase Total
## 1    men     0    0      18    18
## 2    men     1    0       8     8
## 3    men     2    1       1     2
## 4    men     3    0       0     0
## 5    men     4    1       0     1
## 6    men     5    3       0     3
knitr::kable(t(aggregate(cbind(Case, Noncase) ~ Score, data=dta_2, sum)))
Score 0 1 2 3 4 5 6 7 8 9 10
Case 2 2 5 3 3 6 1 3 3 1 1
Noncase 60 22 6 1 1 0 0 0 0 0 0

Gender and GHQ score effects

ggplot(dta_2, aes(x=Score, y=Case/Total, color=Gender))+
  geom_point(pch=1, size=rel(2))+
  stat_smooth(method = "glm", 
              formula=y~x, 
              method.args = list(family="binomial"),
              aes(weight=Total), se=F)+
  scale_color_manual(values=c("black", "red3"))+
  scale_x_continuous(breaks=seq(0, 10, by=1))+
  scale_y_continuous(breaks=seq(0, 1, by=.1))+
  labs(x="Score on GHQ", 
       y="Proportion of cases") +
  theme_minimal()+
  theme(legend.position=c(.1, .9))

#full model
m0 <- glm(cbind(Case, Noncase) ~ Score + Gender + Score:Gender, data=dta_2, family=binomial)
# decrease interaction
m1 <- glm(cbind(Case, Noncase) ~ Score + Gender, data=dta_2, family=binomial)
#decrease gender
m2 <- glm(cbind(Case, Noncase) ~ Score, data=dta_2, family=binomial)
# compare models
anova(m2, m1, m0, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: cbind(Case, Noncase) ~ Score
## Model 2: cbind(Case, Noncase) ~ Score + Gender
## Model 3: cbind(Case, Noncase) ~ Score + Gender + Score:Gender
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1        15     5.7440                       
## 2        14     4.9419  1   0.8021  0.37047  
## 3        13     1.5422  1   3.3997  0.06521 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  • There no significant differences between 3 models
m3 <- glm(cbind(Case, Noncase) ~ Gender, data=dta_2, family=binomial(link=logit)) #add (link=logit)
# compare the difference with or without link=logit
anova(m3, m1, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: cbind(Case, Noncase) ~ Gender
## Model 2: cbind(Case, Noncase) ~ Score + Gender
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1        15     83.054                          
## 2        14      4.942  1   78.112 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

-There is significant differences between m1 and m3 models.

The proportion of deviance accounted for

(m1$null.deviance - m1$deviance)/m1$null.deviance
## [1] 0.9405854
1 - pchisq(m1$deviance, m1$df.residual)
## [1] 0.9866052

-The m1 model fits well

sjPlot::tab_model(m1, show.r2=FALSE, show.obs=FALSE, show.se=TRUE)
  cbind(Case, Noncase)
Predictors Odds Ratios std. Error CI p
(Intercept) 0.02 0.98 0.00 – 0.08 <0.001
Score 4.19 0.29 2.57 – 8.20 <0.001
Gender [women] 2.21 0.93 0.42 – 17.73 0.393

-The Gender factor doesn’t make significant impact on corresponding to a patient. - For a given GHQ score, the odds of a woman being a case is between 2.57 to 8.20 of the corresponding odds for a man.

In Class Exercise 3

A recent General Social Survey asked subjects, “Within the past 12 months, how many people have you known personally that were victims of homicide?” The responses by race, for those who identified themselves as black or as white, were collected.

Source: Agresti, A. (2002). Categorical Data Analysis. p. 561.

  • Column 1: Number of victims personally known to the respondent
  • Column 2: Race ID
dta_3 <- read.csv("victims.csv")
head(dta_3)
##   nV  Race
## 1  0 White
## 2  0 White
## 3  0 White
## 4  0 White
## 5  0 White
## 6  0 White
colnames(dta_3)<-c("No.victim", "Race")
dta_3$Race<-factor(dta_3$Race, levels = c("White", "Black"), ordered = T)
xtabs(~No.victim+Race, dta_3)
##          Race
## No.victim White Black
##         0  1070   119
##         1    60    16
##         2    14    12
##         3     4     7
##         4     0     3
##         5     0     2
##         6     1     0
a<-aggregate(No.victim~Race, data=dta_3, FUN=mean)
b<-aggregate(No.victim~Race, data=dta_3, FUN=var)
d<-merge(a,b,by="Race", rownames=F)
colnames(d)<-c("", "means", "var")
t(d)
##       [,1]         [,2]        
##       "Black"      "White"     
## means "0.52201258" "0.09225413"
## var   "1.1498288"  "0.1552448"

Goodness of fit and overdispersion

md<-glm(No.victim~Race, data=dta_3,family=poisson)
1 - pchisq(deviance(md), df.residual(md))
## [1] 1
performance::check_overdispersion(md)
## # Overdispersion test
## 
##        dispersion ratio =    1.746
##   Pearson's Chi-Squared = 2279.873
##                 p-value =  < 0.001
as.data.frame(summary(md, dispersion=1.746)$coef)
##              Estimate Std. Error    z value     Pr(>|z|)
## (Intercept) -1.516636 0.09683463 -15.662123 2.745770e-55
## Race.L       1.225518 0.13694485   8.948992 3.587447e-19
exp(-0.65)
## [1] 0.5220458
exp(-0.65-1.7331)
## [1] 0.09226411

Exercise 1

Exercise 2

The data set days absent contains attendance data on 314 high school juniors from high schools. The response variable of interest is days absent from school. The standardized math score for each student was given as well as the program variable indicating the type of instructional program in which the student was enrolled.

  • Column 1: Student ID
  • Column 2: Gender ID
  • Column 3: Program ID
  • Column 4: Standardized mathematics score
  • Column 5: Days absent from school

The interest is to model the number of days of absence from school using student’s gender, the standardized test in math, and the type of program in which the student is enrolled.

dta2<- read.csv("days_absent.csv")
head(dta2)
##      ID Gender  Program Math Days
## 1 S1001   male Academic   63    4
## 2 S1002   male Academic   27    4
## 3 S1003 female Academic   20    2
## 4 S1004 female Academic   16    3
## 5 S1005 female Academic    2    3
## 6 S1006 female Academic   71   13
str(dta2)
## 'data.frame':    314 obs. of  5 variables:
##  $ ID     : chr  "S1001" "S1002" "S1003" "S1004" ...
##  $ Gender : chr  "male" "male" "female" "female" ...
##  $ Program: chr  "Academic" "Academic" "Academic" "Academic" ...
##  $ Math   : int  63 27 20 16 2 71 63 3 51 49 ...
##  $ Days   : int  4 4 2 3 3 13 11 7 10 9 ...
ggplot(dta2, 
       aes(Days)) +
 geom_histogram(binwidth = 1, fill=8, col=1)+
 facet_grid(Gender ~ Program) +
 labs(x="Days absent", 
      y="Frequency") +
 theme_minimal()+
 theme(legend.position = c(.95, .9))

### means and variances

aggregate(cbind(Days, Math)~ Gender+Program, data=dta2, FUN=mean)
##   Gender    Program      Days     Math
## 1 female   Academic  7.759036 43.69880
## 2   male   Academic  6.119048 41.50000
## 3 female    General 12.363636 44.40909
## 4   male    General  8.555556 45.33333
## 5 female Vocational  2.709091 58.00000
## 6   male Vocational  2.634615 58.84615
aggregate(cbind(Days, Math)~ Gender+Program, data=dta2, FUN=var)
##   Gender    Program     Days     Math
## 1 female   Academic 68.55098 586.7252
## 2   male   Academic 41.81698 689.8193
## 3 female    General 84.33766 804.5390
## 4   male    General 41.67320 923.5294
## 5 female Vocational 10.28418 512.4815
## 6   male Vocational 18.07956 364.9170

Effects of Program, gender, and math

ggplot(dta2, 
       aes(Math, Days, 
           color=Gender)) +
 stat_smooth(method="glm",
             formula=y ~ x,
             method.args=list(family=poisson),
             size=rel(.5)) +
 geom_point(pch=20, alpha=.5) +
 facet_grid(. ~ Program) +
 labs(y="Days absent", 
      x="Math score") +
 theme_minimal()+
 theme(legend.position = c(.95, .9))

### Parameter estimates

sjPlot::tab_model(m0 <- glm(Days ~ Program*Gender*Math, data=dta2, 
                            family=poisson(link=log)), show.se=T, show.r2=F, show.obs=F)
  Days
Predictors Incidence Rate Ratios std. Error CI p
(Intercept) 10.43 0.08 8.94 – 12.14 <0.001
Program [General] 1.47 0.13 1.13 – 1.91 0.004
Program [Vocational] 0.23 0.25 0.14 – 0.36 <0.001
Gender [male] 0.85 0.11 0.69 – 1.06 0.146
Math 0.99 0.00 0.99 – 1.00 <0.001
Program [General] *
Gender [male]
0.97 0.20 0.65 – 1.43 0.863
Program [Vocational] *
Gender [male]
1.59 0.37 0.76 – 3.27 0.214
Program [General] * Math 1.00 0.00 1.00 – 1.01 0.479
Program [Vocational] *
Math
1.01 0.00 1.00 – 1.02 0.019
Gender [male] * Math 1.00 0.00 0.99 – 1.00 0.274
(Program [General]
Gender [male])
Math
1.00 0.00 0.99 – 1.01 0.692
(Program [Vocational]
Gender [male])
Math
1.00 0.01 0.98 – 1.01 0.634

Goodness of fit and overdispersion

1 - pchisq(deviance(m0), df.residual(m0))
## [1] 0
performance::check_overdispersion(m0)
## # Overdispersion test
## 
##        dispersion ratio =    6.492
##   Pearson's Chi-Squared = 1960.446
##                 p-value =  < 0.001
plot(log(fitted(m0)), log((dta2$Days-fitted(m0))^2), 
     xlab=expression(hat(mu)),
     ylab=expression((y-hat(mu))^2))
grid()
abline(0, 1, col=2)

Dispersion parameter

knitr::kable(as.data.frame(summary(m0, dispersion=6.492)$coef))
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.3450136 0.1989867 11.7847779 0.0000000
ProgramGeneral 0.3873043 0.3411743 1.1352096 0.2562875
ProgramVocational -1.4878389 0.6257345 -2.3777477 0.0174187
Gendermale -0.1588403 0.2785265 -0.5702878 0.5684825
Math -0.0071045 0.0043252 -1.6425893 0.1004679
ProgramGeneral:Gendermale -0.0347250 0.5144980 -0.0674929 0.9461893
ProgramVocational:Gendermale 0.4617659 0.9469746 0.4876222 0.6258175
ProgramGeneral:Math 0.0019809 0.0071360 0.2775861 0.7813301
ProgramVocational:Math 0.0094842 0.0103312 0.9180121 0.3586125
Gendermale:Math -0.0026914 0.0062654 -0.4295662 0.6675112
ProgramGeneral:Gendermale:Math -0.0017088 0.0110050 -0.1552787 0.8766016
ProgramVocational:Gendermale:Math -0.0029735 0.0159262 -0.1867067 0.8518906