final_exam

Question 1

四步驟法

#gender -> math 顯著

summary(r1_1 <- lm(math ~ gender,data = dta1))

## 
## Call:
## lm(formula = math ~ gender, data = dta1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -442.33  -53.77    3.79   59.77  247.52 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  578.534      2.070 279.496  < 2e-16 ***
## gendergirl    -9.153      2.853  -3.208  0.00135 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 83.31 on 3418 degrees of freedom
## Multiple R-squared:  0.003002,   Adjusted R-squared:  0.00271 
## F-statistic: 10.29 on 1 and 3418 DF,  p-value: 0.00135

#math -> phy 顯著

summary(r1_2 <- lm(phy ~ math,data = dta1))

## 
## Call:
## lm(formula = phy ~ math, data = dta1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -253.479  -36.717    0.581   37.567  201.178 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 135.94790    6.82300   19.93   <2e-16 ***
## math          0.74482    0.01177   63.29   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57.41 on 3418 degrees of freedom
## Multiple R-squared:  0.5396, Adjusted R-squared:  0.5394 
## F-statistic:  4005 on 1 and 3418 DF,  p-value: < 2.2e-16

#gender -> phy 顯著

summary(r1_3 <- lm(phy ~ gender,data = dta1))

## 
## Call:
## lm(formula = phy ~ gender, data = dta1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -417.62  -51.64    4.58   57.06  294.82 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  569.474      2.097 271.575  < 2e-16 ***
## gendergirl   -11.794      2.890  -4.081  4.6e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 84.4 on 3418 degrees of freedom
## Multiple R-squared:  0.004848,   Adjusted R-squared:  0.004557 
## F-statistic: 16.65 on 1 and 3418 DF,  p-value: 4.596e-05

#gender -> phy 兩者皆顯著(部分中介)

summary(r1_4 <- lm(phy ~ gender+math,data = dta1))

## 
## Call:
## lm(formula = phy ~ gender + math, data = dta1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -256.339  -36.665    1.044   37.076  203.737 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 139.51472    6.96099  20.042   <2e-16 ***
## gendergirl   -4.99229    1.96748  -2.537   0.0112 *  
## math          0.74319    0.01178  63.104   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57.36 on 3417 degrees of freedom
## Multiple R-squared:  0.5404, Adjusted R-squared:  0.5402 
## F-statistic:  2009 on 2 and 3417 DF,  p-value: < 2.2e-16

Sobel test

a <- summary(r1_1)$coefficients['gendergirl','Estimate']
ase <- summary(r1_1)$coefficients['gendergirl','Std. Error']
b <- summary(r1_4)$coefficients['math','Estimate']
bse <- summary(r1_4)$coefficients['math','Std. Error']

ab <- a*b
abse <- sqrt(ase*ase*b*b+bse*bse*a*a)
zab <- ab/abse
pab <- (1-pnorm(abs(zab)))*2
zab##z值

## [1] -3.203734

pab

## [1] 0.001356575

Bootstrapping

cresult1 <- matrix(NA,5000,1)
n1 <- dim(dta1)[1]
for (i in 1:5000){
  #抽樣，製造新的樣本。
  newdat <- dta1[sample(1:n1,n1,replace=T),]
  #在新樣本上跑迴歸，擷取迴歸係數計算中介效果
  r1_5<-lm(math ~ gender,data=newdat)
  r1_6<-lm(phy ~ gender+math,data=newdat)
  a<-summary(r1_5)$coefficients['gendergirl','Estimate']
  b<-summary(r1_6)$coefficients['math','Estimate']
  ab<-a*b
  cresult1[i]<-ab
}

quantile(cresult1,c(.025,.975))##信賴區間

##       2.5%      97.5% 
## -10.909362  -2.677779

Question 2

cresult2 <- matrix(NA,5000,1)
n2 <- dim(dta2)[1]
for (i in 1:5000){
  #抽樣，製造新的樣本。
  newdat2 <- dta2[sample(1:n2,n2,replace=T),]
  #算變異係數
  
  cresult2[i] <- sd(newdat2)/mean(newdat2)
  
}

quantile(cresult2,c(.025,.975))##95%信賴區間

##      2.5%     97.5% 
## 0.1203109 0.1551773

Question 3

1. 擁有電腦的比率

dta3%>%group_by(paedu)%>%summarize(rate = sum(computer == "YES")/(sum(computer == "YES")+sum(computer == "NO")))

## # A tibble: 5 × 2
##        paedu      rate
##       <fctr>     <dbl>
## 1    college 0.9156939
## 2 elementary 0.7500000
## 3       JunH 0.7051282
## 4       SenH 0.8491228
## 5 University 0.9613493

2. dummy coding + logistic regression 看university(D3)的係數

dta3 <- dta3%>%mutate(D1 = paedu == "JunH",
                      D2 = paedu == "SenH",
                      D3 = paedu == "University",
                      D4 = paedu == "college")
dta3 <- dta3%>%mutate(D1 = as.factor(D1),
                      D2 = as.factor(D2),
                      D3 = as.factor(D3),
                      D4 = as.factor(D4))

summary(r3 <- glm(computer ~ D1 + D2 + D3 + D4, data = dta3 ,family = "binomial"))

## 
## Call:
## glm(formula = computer ~ D1 + D2 + D3 + D4, family = "binomial", 
##     data = dta3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5508   0.2808   0.4197   0.5719   0.8359  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   1.0986     0.8165   1.346   0.1785  
## D1TRUE       -0.2268     0.8534  -0.266   0.7905  
## D2TRUE        0.6291     0.8207   0.767   0.4433  
## D3TRUE        2.1152     0.8280   2.555   0.0106 *
## D4TRUE        1.2866     0.8267   1.556   0.1196  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2111.6  on 3419  degrees of freedom
## Residual deviance: 1982.4  on 3415  degrees of freedom
## AIC: 1992.4
## 
## Number of Fisher Scoring iterations: 6

3. 和教育程度是小學的父母相比，教育程度是大學或是以上的父母，家中有電腦的勝算會多出8.29倍左右

Question 4

1. 平均次數為24/18 = 1.33次

2. 10分鐘內無人發問的機率大約是26.4%

3. 2位(計算看下面)

#1.

24/18

## [1] 1.333333

#2.

ppois(0,(24/18))

## [1] 0.2635971

#3.

qpois(0.8,(24/18))

## [1] 2

Question 5

1. 每日平均到店人數約為35(34.79)人

2. 每日平均到店人數約為14(14.2)人

3. 5*2.5 = 12.5，約13人

4. 5.71公里(計算看下面)

#1. 

exp(3.54934)

## [1] 34.79035

#2.

exp(3.54934-(5*0.17915))

## [1] 14.20494

#3.

5*2.5

## [1] 12.5

#4.

(3.54934-log(12.5))/0.17915

## [1] 5.713711

Question 6

Warning : 看不懂piecewise model的請跳過，那是額外多做來比較的

1. hw simple linear model coefficient = 0.383

2. 不支持，隨著作業越多學期發問次數也越多

3. 越多越好，嚴重懷疑學期發問次數可以測量學生興趣，從piecewise model甚至可以看到超過30題作業之後，每多1題作業學生會多問10題問題，沒有出現習得無助感覺得很神奇

dta6 <- read.table("C:/Users/Cheng_wen_sung/Desktop/16Exam/Q6.txt",header = T)
str(dta6)

## 'data.frame':    500 obs. of  3 variables:
##  $ id : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ hw : int  20 25 17 15 12 21 13 14 17 18 ...
##  $ qno: int  8 8 4 3 3 3 5 0 3 5 ...

# set up linear spline
dta6 <- dta6 %>% 
  mutate( hw_intox = factor(ifelse(hw >= 25, "Yes", "No")),
          hw_above = ifelse(hw < 25, 0, hw - 25))

#ggplot with three fitted line

ggplot(data = dta6,aes(x = hw , y = qno))+
  geom_point(alpha  = 0.5)+
  stat_smooth(method = "lm",se = F,linetype = "dashed")+
  geom_vline(xintercept = 25, linetype = "dotted") +
  stat_smooth(aes(group = hw_intox), method = "lm", se = F) +
  theme_bw()

#simple linear model

summary(r6_1 <- lm(qno ~ hw , data = dta6))

## 
## Call:
## lm(formula = qno ~ hw, data = dta6)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.843 -1.460 -0.311  1.226 34.795 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.43329    0.37038   -6.57 1.27e-10 ***
## hw           0.38298    0.02374   16.13  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.54 on 498 degrees of freedom
## Multiple R-squared:  0.3432, Adjusted R-squared:  0.3419 
## F-statistic: 260.3 on 1 and 498 DF,  p-value: < 2.2e-16

#piecewise model

summary(r6_2 <- segmented(r6_1, seg.Z = ~ hw, psi = 25))

## 
##  ***Regression Model with Segmented Relationship(s)***
## 
## Call: 
## segmented.lm(obj = r6_1, seg.Z = ~hw, psi = 25)
## 
## Estimated Break-Point(s):
##    Est. St.Err 
## 29.616  0.192 
## 
## Meaningful coefficients of the linear terms:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.35864    0.29346   -4.63 4.68e-06 ***
## hw           0.30357    0.01898   15.99  < 2e-16 ***
## U1.hw       10.14687    0.90603   11.20       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.957 on 496 degrees of freedom
## Multiple R-Squared: 0.6118,  Adjusted R-squared: 0.6095 
## 
## Convergence attained in 2 iterations with relative change 1.197219e-16

# plot model fit
plot(r6_2)
with(dta6, points(hw, qno))
abline(v = r6_2$psi[2], lty = 3)
grid()

Question 7

不確定題目是要分開跑glm還是2個一起，以下用的是家庭衝突與學校疏離一起預測是否失眠

家庭衝突分數每增加1分，會讓失眠的勝算增加為原先的4.21倍，學校疏離分數每增加1分會讓失眠的勝算增加為原先的5.41倍

dta7 <- read.table("C:/Users/Cheng_wen_sung/Desktop/16Exam/Q7.txt")
str(dta7)

## 'data.frame':    1976 obs. of  7 variables:
##  $ V1: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ V2: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ V3: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ V4: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ V5: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ V6: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ V7: num  1 1.2 1.2 1.2 1.2 1.4 1.4 1.4 1.4 1.4 ...

colnames(dta7) <- c("insomnia","gender","twelfth","coffee","conflict","supervise","alienation")
dta7 <- dta7%>%mutate(insomnia = as.factor(insomnia),
                      gender = as.factor(gender),
                      twelfth = as.factor(twelfth),
                      coffee = as.factor(coffee))


summary(r7 <- glm(insomnia ~　conflict + conflict:twelfth + twelfth+
                    alienation + alienation:conflict + 
                    alienation:twelfth + alienation:conflict:twelfth,
                  family = "binomial",data = dta7))

## 
## Call:
## glm(formula = insomnia ~ conflict + conflict:twelfth + twelfth + 
##     alienation + alienation:conflict + alienation:twelfth + alienation:conflict:twelfth, 
##     family = "binomial", data = dta7)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0319  -0.5851  -0.4851  -0.3595   2.6817  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -6.5679     1.3671  -4.804 1.55e-06 ***
## conflict                       1.4379     0.6358   2.261   0.0237 *  
## twelfth1                       4.2609     2.5394   1.678   0.0934 .  
## alienation                     1.6882     0.6753   2.500   0.0124 *  
## conflict:twelfth1             -1.1690     1.2215  -0.957   0.3386    
## conflict:alienation           -0.3864     0.3045  -1.269   0.2046    
## twelfth1:alienation           -1.7840     1.3251  -1.346   0.1782    
## conflict:twelfth1:alienation   0.4736     0.6149   0.770   0.4412    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1512.2  on 1975  degrees of freedom
## Residual deviance: 1449.5  on 1968  degrees of freedom
## AIC: 1465.5
## 
## Number of Fisher Scoring iterations: 5

exp(cbind(coef=coef(r7),confint(r7)))

##                                      coef        2.5 %       97.5 %
## (Intercept)                   0.001404685 0.0000916246 1.985185e-02
## conflict                      4.211637446 1.2195944803 1.495185e+01
## twelfth1                     70.871216173 0.4514788840 9.764706e+03
## alienation                    5.409996386 1.4428463254 2.066859e+01
## conflict:twelfth1             0.310677789 0.0292491076 3.585649e+00
## conflict:alienation           0.679522819 0.3696799032 1.232210e+00
## twelfth1:alienation           0.167966851 0.0125365900 2.304951e+00
## conflict:twelfth1:alienation  1.605751713 0.4671521520 5.288049e+00

final_exam

Wen_Sung Cheng

2017年1月16日

Question 1

四步驟法

Sobel test

Bootstrapping

Question 2

Question 3

1. 擁有電腦的比率

2. dummy coding + logistic regression 看university(D3)的係數

3. 和教育程度是小學的父母相比，教育程度是大學或是以上的父母，家中有電腦的勝算會多出8.29倍左右

Question 4

1. 平均次數為24/18 = 1.33次

2. 10分鐘內無人發問的機率大約是26.4%

3. 2位(計算看下面)

Question 5

1. 每日平均到店人數約為35(34.79)人

2. 每日平均到店人數約為14(14.2)人

3. 5*2.5 = 12.5，約13人

4. 5.71公里(計算看下面)

Question 6

Warning : 看不懂piecewise model的請跳過，那是額外多做來比較的

1. hw simple linear model coefficient = 0.383

2. 不支持，隨著作業越多學期發問次數也越多

3. 越多越好，嚴重懷疑學期發問次數可以測量學生興趣，從piecewise model甚至可以看到超過30題作業之後，每多1題作業學生會多問10題問題，沒有出現習得無助感覺得很神奇

Question 7

不確定題目是要分開跑glm還是2個一起，以下用的是家庭衝突與學校疏離一起預測是否失眠

家庭衝突分數每增加1分，會讓失眠的勝算增加為原先的4.21倍，學校疏離分數每增加1分會讓失眠的勝算增加為原先的5.41倍