1 Description

A recent General Social Survey asked subjects, “Within the past 12 months, how many people have you known personally that were victims of homicide?”
The responses by race, for those who identified themselves as black or as white, were collected.

Column 1: Number of victims personally known to the respondent
Column 2: Race ID

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

2 input data

dta3 <- read.csv("C:/Users/HANK/Desktop/HOMEWORK/victims.csv")
dta3 <- mutate(dta3, 
               Race = factor(Race))
str(dta3)

## 'data.frame':    1308 obs. of  2 variables:
##  $ nV  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Race: Factor w/ 2 levels "Black","White": 2 2 2 2 2 2 2 2 2 2 ...

head(dta3)

##   nV  Race
## 1  0 White
## 2  0 White
## 3  0 White
## 4  0 White
## 5  0 White
## 6  0 White

3 Descriptive data

3.1 Form

t1 <- table(dta3$nV, dta3$Race)

# ftable(mytable)

kbl(t1)%>%
  kable_styling(bootstrap_options = c("striped", "hover")
              , full_width = F)%>%
  add_header_above(c("Number of Victims", "Race" = 2))

Number of Victims	Race
	Black	White
0	119	1070
1	16	60
2	12	14
3	7	4
4	3	0
5	2	0
6	0	1

檢視表格中，的黑人與白人種族，樣本數是不平均的，白人明顯多於黑人。

3.2 Mean & variance

dta3mean <- aggregate(nV ~ Race, data = dta3, mean)
dta3var <- aggregate(nV ~ Race, data = dta3, var)
t2 <- Hmisc::Merge(dta3mean, dta3var, id = ~ Race)

##          Vars Obs Unique IDs IDs in #1 IDs not in #1
## dta3mean    2   2          2        NA            NA
## dta3var     2   2          2         2             0
## Merged      3   2          2         2             0
## 
## Number of unique IDs in any data frame : 2 
## Number of unique IDs in all data frames: 2

colnames(t2) <- c("Race", "mean", "var")

kbl(t2)%>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)

Race	mean	var
Black	0.5220126	1.1498288
White	0.0922541	0.1552448

自稱是黑人的人，他們所認識的兇殺受害者的人數比白人認識的兇殺受害者的人數更多(特別是在3人以上，所佔整體的比例高)。上表平均數亦顯示黑人的0.522高於白人的0.092。意即黑人種族會知道更多的兇殺受害者。這個現象或許可以解釋為存有“種族”這個因素的影響。

4 plot

4.1 Bar chart

ggplot(dta3, 
       aes(nV)) +
 geom_histogram(binwidth = 1, fill=8, col=1)+
 facet_grid(~Race) +
 labs(x="Number of victims", 
      y="Number of the person who response") +
 theme_minimal()+
 theme(legend.position = c(.95, .9))

4.2 xxx

dta3$nVc <- dta3$nV
dta3$nVc[dta3$nVc >= 0 ] <- "1"
tail(dta3)

##      nV  Race nVc
## 1303  4 Black   1
## 1304  4 Black   1
## 1305  4 Black   1
## 1306  5 Black   1
## 1307  5 Black   1
## 1308  6 White   1

4.3 plot2

dta3$nVc <- as.integer(dta3$nVc, ref="0")
dta3$Race <- relevel(dta3$Race, ref = "White")

ggplot(dta3, 
       aes(Race, nV, color=Race)) +
 stat_smooth(method="glm",
             formula= nV ~ Race,
             method.args=list(family=poisson),
             size=rel(.5)) +
 geom_point(pch=20, alpha=.5) +
 #facet_grid(. ~ Race) +
 labs(y="Number of victims", 
      x="Race") +
 theme_minimal()+
 theme(legend.position = c(.95, .9))

以種族為分組依據，繪製圖表

5 GLM

5.1 Poisson distribution

dta3$nVc <- dta3$nV
dta3$nVc[dta3$nVc >= 0 ] <- "1"
tail(dta3)

##      nV  Race nVc
## 1303  4 Black   1
## 1304  4 Black   1
## 1305  4 Black   1
## 1306  5 Black   1
## 1307  5 Black   1
## 1308  6 White   1

dta3$nVc <- as.integer(dta3$nVc, ref="0")

summary(m0p <- glm(nV ~ Race, data=dta3, family=poisson(link=log)))

## 
## Call:
## glm(formula = nV ~ Race, family = poisson(link = log), data = dta3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0218  -0.4295  -0.4295  -0.4295   6.1874  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.38321    0.09713  -24.54   <2e-16 ***
## RaceBlack    1.73314    0.14657   11.82   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 962.80  on 1307  degrees of freedom
## Residual deviance: 844.71  on 1306  degrees of freedom
## AIC: 1122
## 
## Number of Fisher Scoring iterations: 6

5.2 Parameter estimates

sjPlot::tab_model(m0p, show.se=T, show.r2=F, show.obs=F)

	nV
Predictors	Incidence Rate Ratios	std. Error	CI	p
(Intercept)	0.09	0.10	0.08 – 0.11	<0.001
Race [Black]	5.66	0.15	4.24 – 7.53	<0.001

5.3 Goodness of fit and overdispersion

1 - pchisq(deviance(m0p), df.residual(m0p))

## [1] 1

performance::check_overdispersion(m0p)

## # Overdispersion test
## 
##        dispersion ratio =    1.746
##   Pearson's Chi-Squared = 2279.873
##                 p-value =  < 0.001

## Overdispersion detected.

資料過度分散

plot(log(fitted(m0p)), log((dta3$nV-fitted(m0p))^2), 
     xlab=expression(hat(mu)),
     ylab=expression((y-hat(mu))^2))
grid()
abline(0, 1, col=2)

5.4 Dispersion parameter

kbl(as.data.frame(summary(m0p, dispersion=1.746)$coef))%>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)

	Estimate	Std. Error	z value	Pr(>\|z\|)
(Intercept)	-2.383208	0.1283420	-18.569195	0
RaceBlack	1.733145	0.1936693	8.948991	0

drop1(m0p, test="F")

## Warning in drop1.glm(m0p, test = "F"): F test assumes 'quasipoisson' family

## Single term deletions
## 
## Model:
## nV ~ Race
##        Df Deviance    AIC F value    Pr(>F)    
## <none>      844.71 1122.0                      
## Race    1   962.80 1238.1  182.58 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

5.5 Negative binominal

require(MASS)

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

summary(m0n <- glm.nb(nV ~ Race, data = dta3))

## 
## Call:
## glm.nb(formula = nV ~ Race, data = dta3, init.theta = 0.2023119205, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7184  -0.3899  -0.3899  -0.3899   3.5072  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.3832     0.1172 -20.335  < 2e-16 ***
## RaceBlack     1.7331     0.2385   7.268 3.66e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.2023) family taken to be 1)
## 
##     Null deviance: 471.57  on 1307  degrees of freedom
## Residual deviance: 412.60  on 1306  degrees of freedom
## AIC: 1001.8
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2023 
##           Std. Err.:  0.0409 
## 
##  2 x log-likelihood:  -995.7980

MASS::stepAIC(glm.nb(nV ~ Race, data = dta3), 
               scope=list(upper = ~ Race, lower= ~ 1), trace=FALSE)

## 
## Call:  glm.nb(formula = nV ~ Race, data = dta3, init.theta = 0.2023119205, 
##     link = log)
## 
## Coefficients:
## (Intercept)    RaceBlack  
##      -2.383        1.733  
## 
## Degrees of Freedom: 1307 Total (i.e. Null);  1306 Residual
## Null Deviance:       471.6 
## Residual Deviance: 412.6     AIC: 1002

sjPlot::tab_model(m0n, show.df =T, show.se=T, show.r2=T, show.obs=T, show.aic = T, show.fstat = T,
df.method = "wald", transform=NULL)

	nV
Predictors	Log-Mean	std. Error	CI	p	df
(Intercept)	-2.38	0.12	-2.61 – -2.15	<0.001	Inf
Race [Black]	1.73	0.24	1.27 – 2.20	<0.001	Inf
Observations	1308
R² Nagelkerke	0.146
AIC	1001.798

1026_In Class Exercise 3

Pei-Yun,Li

2020-10-28