T.test和迴歸語法

##借用語法
library(dplyr)
library(ggplot2) 
library(viridis) 
library(ggthemes)
library(tidyr)
library(hrbrthemes)
library(tidyverse)
library(lattice)

dta <- read.table("C:/tmp/hs0.txt",h=T, as.is=T)
##看存檔位置
getwd()

## [1] "C:/tmp"

##查看檔案型式
str(dta)

## 'data.frame':    200 obs. of  11 variables:
##  $ id     : int  70 121 86 141 172 113 50 11 84 48 ...
##  $ female : chr  "male" "female" "male" "male" ...
##  $ race   : chr  "white" "white" "white" "white" ...
##  $ ses    : chr  "low" "middle" "high" "high" ...
##  $ schtyp : chr  "public" "public" "public" "public" ...
##  $ prog   : chr  "general" "vocation" "general" "vocation" ...
##  $ read   : int  57 68 44 63 47 44 50 34 63 57 ...
##  $ write  : int  52 59 33 44 52 52 59 46 57 55 ...
##  $ math   : int  41 53 54 47 57 51 42 45 54 52 ...
##  $ science: int  47 63 58 53 53 63 53 39 58 NA ...
##  $ socst  : int  57 61 31 56 61 61 61 36 51 51 ...

##查看檔案行列
dim(dta)

## [1] 200  11

##查每個變項名稱
names(dta)

##  [1] "id"      "female"  "race"    "ses"     "schtyp"  "prog"    "read"   
##  [8] "write"   "math"    "science" "socst"

##檔案的型態
class(dta)

## [1] "data.frame"

##以t-test獨立樣本比較各個變項的差異，但是語法太長了，不知道能不能用function寫t-test自動跑各個變項
#各個p值均高於.05，可以解釋每個變項的平均值沒有顯著差異
t.test(dta$read, dta$write, data=dta)

## 
##  Welch Two Sample t-test
## 
## data:  dta$read and dta$write
## t = -0.55199, df = 395.57, p-value = 0.5813
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.486081  1.396081
## sample estimates:
## mean of x mean of y 
##    52.230    52.775

t.test(dta$read, dta$science, data=dta)

## 
##  Welch Two Sample t-test
## 
## data:  dta$read and dta$science
## t = 0.30932, df = 392.84, p-value = 0.7572
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.671320  2.295423
## sample estimates:
## mean of x mean of y 
##  52.23000  51.91795

t.test(dta$read, dta$math, data=dta)

## 
##  Welch Two Sample t-test
## 
## data:  dta$read and dta$math
## t = -0.42258, df = 394.8, p-value = 0.6728
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.345732  1.515732
## sample estimates:
## mean of x mean of y 
##    52.230    52.645

t.test(dta$read, dta$socst, data=dta)

## 
##  Welch Two Sample t-test
## 
## data:  dta$read and dta$socst
## t = -0.16671, df = 397.16, p-value = 0.8677
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.238691  1.888691
## sample estimates:
## mean of x mean of y 
##    52.230    52.405

t.test(dta$write, dta$science, data=dta)

## 
##  Welch Two Sample t-test
## 
## data:  dta$write and dta$science
## t = 0.88336, df = 391.67, p-value = 0.3776
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.050445  2.764547
## sample estimates:
## mean of x mean of y 
##  52.77500  51.91795

t.test(dta$write, dta$math, data=dta)

## 
##  Welch Two Sample t-test
## 
## data:  dta$write and dta$math
## t = 0.13795, df = 397.95, p-value = 0.8903
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.722641  1.982641
## sample estimates:
## mean of x mean of y 
##    52.775    52.645

t.test(dta$write, dta$socst, data=dta)

## 
##  Welch Two Sample t-test
## 
## data:  dta$write and dta$socst
## t = 0.36537, df = 391.98, p-value = 0.715
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.620948  2.360948
## sample estimates:
## mean of x mean of y 
##    52.775    52.405

t.test(dta$science, dta$math, data=dta)

## 
##  Welch Two Sample t-test
## 
## data:  dta$science and dta$math
## t = -0.75353, df = 391.09, p-value = 0.4516
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.624011  1.169909
## sample estimates:
## mean of x mean of y 
##  51.91795  52.64500

t.test(dta$science, dta$socst, data=dta)

## 
##  Welch Two Sample t-test
## 
## data:  dta$science and dta$socst
## t = -0.4712, df = 391.29, p-value = 0.6378
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.519224  1.545121
## sample estimates:
## mean of x mean of y 
##  51.91795  52.40500

t.test(dta$math, dta$socst, data=dta)

## 
##  Welch Two Sample t-test
## 
## data:  dta$math and dta$socst
## t = 0.23821, df = 390.83, p-value = 0.8118
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.740865  2.220865
## sample estimates:
## mean of x mean of y 
##    52.645    52.405

##先切出race，再以迴歸比較read和write的差異(正確的)
#和下方語法很像，但結果不同
m0 <- lapply(split(dta, (dta$race)), 
             function(x) lm(x$read ~ x$write))
sapply(m0, coef)

##             african-amer      asian   hispanic      white
## (Intercept)   23.7450339 29.0437063 13.8832561 19.3080838
## x$write        0.4783188  0.3942308  0.7056519  0.6403838

##這個語法應該也是讓read和write跑迴歸但是數值卻是一樣的(錯誤語法)
m0 <- with(dta, by(dta, dta$race, function(x) lm(dta$read ~ dta$write, data=dta)))
sapply(m0, coef)

##             african-amer    asian hispanic    white
## (Intercept)     18.16215 18.16215 18.16215 18.16215
## dta$write        0.64553  0.64553  0.64553  0.64553

##依序比較和face的各個關係，值均高於.05
m0 <- lapply(split(dta, (dta$race)), 
             function(x) lm(x$read ~ x$science))
sapply(m0, coef)

##             african-amer     asian  hispanic    white
## (Intercept)   25.5547928 30.175414 5.1546176 18.23467
## x$science      0.4881625  0.422386 0.9054034  0.65570

m0 <- lapply(split(dta, (dta$race)), 
             function(x) lm(x$read ~ x$math))
sapply(m0, coef)

##             african-amer      asian  hispanic     white
## (Intercept)   12.2526415 26.3941062 8.4430991 15.355125
## x$math         0.7389809  0.4454997 0.8061209  0.714606

m0 <- lapply(split(dta, (dta$race)), 
             function(x) lm(x$read ~ x$socst))
sapply(m0, coef)

##             african-amer      asian   hispanic      white
## (Intercept)   22.3330875 25.0669856 19.4594864 23.2191508
## x$socst        0.4947808  0.5263158  0.5692871  0.5719711

##colMeans計算整個行列的平均，sweep計算行列
aves <- colMeans(dta[,7:11], na.rm=T)
sweep(dta[,7:11], 2, aves)[1:6,]

##    read   write    math   science   socst
## 1  4.77  -0.775 -11.645 -4.917949   4.595
## 2 15.77   6.225   0.355 11.082051   8.595
## 3 -8.23 -19.775   1.355  6.082051 -21.405
## 4 10.77  -8.775  -5.645  1.082051   3.595
## 5 -5.23  -0.775   4.355  1.082051   8.595
## 6 -8.23  -0.775  -1.645 11.082051   8.595

##固定read和write項目跑迴歸，p值小於.05，高迴歸代表預測性高?!
y <- dta[, "read"]
res <- sapply(dta[, 7:11], function(x) lm(y ~ x))
lapply(res[1:2], broom::tidy)

## $read
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept) 4.02e-14  2.19e-15   1.84e 1 1.08e-44
## 2 x           1.00e+ 0  4.11e-17   2.43e16 0.      
## 
## $write
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   18.2      3.31        5.49 1.21e- 7
## 2 x              0.646    0.0617     10.5  1.11e-20

##固定read和math項目跑迴歸，p值小於.05，高迴歸
y <- dta[, "read"]
res <- sapply(dta[, 8:11], function(x) lm(y ~ x))
lapply(res[1:2], broom::tidy)

## $write
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   18.2      3.31        5.49 1.21e- 7
## 2 x              0.646    0.0617     10.5  1.11e-20
## 
## $math
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   14.1      3.12        4.52 1.08e- 5
## 2 x              0.725    0.0583     12.4  1.28e-26

##固定read和science項目跑迴歸，p值小於.05，高迴歸
y <- dta[, "read"]
res <- sapply(dta[, 9:11], function(x) lm(y ~ x))
lapply(res[1:2], broom::tidy)

## $math
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   14.1      3.12        4.52 1.08e- 5
## 2 x              0.725    0.0583     12.4  1.28e-26
## 
## $science
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   18.2      3.10        5.87 1.91e- 8
## 2 x              0.654    0.0586     11.2  1.22e-22

##固定read和socst項目跑迴歸，p值小於.05，高迴歸
y <- dta[, "read"]
res <- sapply(dta[, 10:11], function(x) lm(y ~ x))
lapply(res[1:2], broom::tidy)

## $science
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   18.2      3.10        5.87 1.91e- 8
## 2 x              0.654    0.0586     11.2  1.22e-22
## 
## $socst
## # A tibble: 2 x 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)   21.1      2.84        7.43 3.22e-12
## 2 x              0.594    0.0532     11.2  9.29e-23

##若比較不同race的read和socst項目跑迴歸，p值小於.05，高迴歸，個比較語法如下
test_by_race <- paste0(names(dta[7:11]), " ~ race")
formulae <- lapply(test_by_race, FUN=formula)
res_aov <- lapply(formulae, function(f) lm(f, data=dta))
lapply(res_aov[1:2], anova)

## [[1]]
## Analysis of Variance Table
## 
## Response: read
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## race        3  1749.8  583.27  5.9637 0.0006539 ***
## Residuals 196 19169.6   97.80                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## [[2]]
## Analysis of Variance Table
## 
## Response: write
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## race        3  1914.2  638.05  7.8334 5.785e-05 ***
## Residuals 196 15964.7   81.45                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

test_by_race <- paste0(names(dta[8:11]), " ~ race")
formulae <- lapply(test_by_race, FUN=formula)
res_aov <- lapply(formulae, function(f) lm(f, data=dta))
lapply(res_aov[1:2], anova)

## [[1]]
## Analysis of Variance Table
## 
## Response: write
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## race        3  1914.2  638.05  7.8334 5.785e-05 ***
## Residuals 196 15964.7   81.45                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## [[2]]
## Analysis of Variance Table
## 
## Response: math
##            Df  Sum Sq Mean Sq F value   Pr(>F)    
## race        3  1842.1  614.05  7.7033 6.84e-05 ***
## Residuals 196 15623.7   79.71                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

test_by_race <- paste0(names(dta[9:11]), " ~ race")
formulae <- lapply(test_by_race, FUN=formula)
res_aov <- lapply(formulae, function(f) lm(f, data=dta))
lapply(res_aov[1:2], anova)

## [[1]]
## Analysis of Variance Table
## 
## Response: math
##            Df  Sum Sq Mean Sq F value   Pr(>F)    
## race        3  1842.1  614.05  7.7033 6.84e-05 ***
## Residuals 196 15623.7   79.71                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## [[2]]
## Analysis of Variance Table
## 
## Response: science
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## race        3  3169.5 1056.51  13.063 8.505e-08 ***
## Residuals 191 15447.2   80.88                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

test_by_race <- paste0(names(dta[10:11]), " ~ race")
formulae <- lapply(test_by_race, FUN=formula)
res_aov <- lapply(formulae, function(f) lm(f, data=dta))
lapply(res_aov[1:2], anova)

## [[1]]
## Analysis of Variance Table
## 
## Response: science
##            Df  Sum Sq Mean Sq F value    Pr(>F)    
## race        3  3169.5 1056.51  13.063 8.505e-08 ***
## Residuals 191 15447.2   80.88                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## [[2]]
## Analysis of Variance Table
## 
## Response: socst
##            Df  Sum Sq Mean Sq F value  Pr(>F)  
## race        3   943.9  314.63   2.804 0.04098 *
## Residuals 196 21992.3  112.21                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

T.test和迴歸語法

KY

2020-05-06