The math attainment page has a dataset and a script of R code chunks. Generate a markdown file from the script to push the output in HTML for posting to course Moodle site.

first R session using math attainment data set

input data

read in a plain text file with variable names and assign a name to it

用 dta <- read.table("D:/User/Desktop/datamanagement/20211018/math_attainment.txt", header = TRUE) 把資料讀進 dta。檔案第一列是變項名稱(標題列)的話,一定要加 header = TRUE。

# Read the plain-text data file. Its first line holds the variable names, so
# header must be TRUE. TRUE is spelled out because T is an ordinary variable
# that can be reassigned, whereas TRUE is a reserved word.
dta <- read.table("D:/User/Desktop/datamanagement/20211018/math_attainment.txt", header = TRUE)
# Print the whole data frame to check that it was read in as expected.
dta
##    math2 math1     cc
## 1     28    18 328.20
## 2     56    22 406.03
## 3     51    44 386.94
## 4     13     8 166.91
## 5     39    20 328.20
## 6     41    12 328.20
## 7     30    16 160.28
## 8     13     5  94.94
## 9     17     9  98.20
## 10    32    18 331.62
## 11    32    20 328.20
## 12    46    30 191.36
## 13    37    10 191.36
## 14    36    16 191.36
## 15    27    14 211.11
## 16    23    14 131.25
## 17    23     9 131.25
## 18    44    24 203.54
## 19    17     8 211.11
## 20    23     5  44.72
## 21    17    10 135.08
## 22    27    13 150.39
## 23    21     7  85.26
## 24    18    14 129.82
## 25    47    21 187.43
## 26    21     4 129.82
## 27    34    16 144.25
## 28    26    12 141.15
## 29    16    10 115.47
## 30    27    16 129.82
## 31    23    11 155.67
## 32    35    16 187.43
## 33    31    16 141.15
## 34    20    14 189.14
## 35    23    12 187.43
## 36    37    29 187.43
## 37    31    25 189.68
## 38    19    16 183.61
## 39    21    15 129.82

checking data

structure of data

# Compact overview of dta: number of observations, plus each variable's type
# and first few values.
str(dta)
## 'data.frame':    39 obs. of  3 variables:
##  $ math2: int  28 56 51 13 39 41 30 13 17 32 ...
##  $ math1: int  18 22 44 8 20 12 16 5 9 18 ...
##  $ cc   : num  328 406 387 167 328 ...

dta裡面有3個變數、39個觀察值

first 6 rows

# Show the first six rows (six is also head()'s default).
head(dta, n = 6L)
##   math2 math1     cc
## 1    28    18 328.20
## 2    56    22 406.03
## 3    51    44 386.94
## 4    13     8 166.91
## 5    39    20 328.20
## 6    41    12 328.20

列出前六個觀察值

# Minimum, quartiles, median, mean and maximum of each variable in turn.
# [[ ]] extracts the column by its exact name (no partial matching).
summary(dta[["math1"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.00   10.00   14.00   15.36   18.00   44.00
summary(dta[["math2"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   21.00   27.00   28.77   35.50   56.00
summary(dta[["cc"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   44.72  131.25  183.61  188.84  197.45  406.03

做變項(math1, math2, cc)的總覽

descriptive statistics

variable mean

# Mean of every column (variable) of dta, returned as a named vector.
colMeans(dta)
##     math2     math1        cc 
##  28.76923  15.35897 188.83667

計算column的平均值

variable sd

# MARGIN = 2 applies sd() to each column, giving one standard deviation per
# variable (math2, math1, cc).
apply(dta, 2, sd)
##     math2     math1        cc 
## 10.720029  7.744224 84.842513

在上面的 apply() 中,dta 是我們的資料(data frame,計算時會被當成矩陣),2 代表 apply 的計算是 by column,sd 是告訴 apply 我們要算每個欄(變項)的標準差。

# MARGIN = 1 applies sum() across each row, giving one total per observation
# (39 values, one for each row of dta).
apply(dta, 1, sum)
##  [1] 374.20 484.03 481.94 187.91 387.20 381.20 206.28 112.94 124.20 381.62
## [11] 380.20 267.36 238.36 243.36 252.11 168.25 163.25 271.54 236.11  72.72
## [21] 162.08 190.39 113.26 161.82 255.43 154.82 194.25 179.15 141.47 172.82
## [31] 189.67 238.43 188.15 223.14 222.43 253.43 245.68 218.61 165.82

1 告訴 apply 這次的計算要 by row,sum 代表加總,所以會得到 39 個結果(39 個觀察值各一個總和)。這部分是嘗試 apply 中 1、2 的差別:1 是 by row,2 是 by column。

correlation matrix

# Correlation matrix of all pairs of variables in dta (cor() uses the Pearson
# method unless told otherwise).
cor(dta)
##           math2     math1        cc
## math2 1.0000000 0.7443604 0.6570098
## math1 0.7443604 1.0000000 0.5956771
## cc    0.6570098 0.5956771 1.0000000

做dta資料變項的相關性。

plot data

specify square plot region

# Request a square plotting region for the plots that follow.
par(pty = "s")

scatter plot of math2 by math1

# Scatter plot of Year 2 against Year 1 math scores; both axes share the same
# 0-60 range.
plot(
  math2 ~ math1,
  data = dta,
  xlim = c(0, 60),
  ylim = c(0, 60),
  xlab = "Math score at Year 1",
  ylab = "Math score at Year 2"
)

add grid lines

# Overlay background grid lines on the current plot.
grid()

# Scatter plot of Year 2 against Year 1 math scores, with a grid and a dashed
# line at the mean Year 2 score.
plot(
  math2 ~ math1,
  data = dta,
  xlim = c(0, 60),
  ylim = c(0, 60),
  xlab = "Math score at Year 1",
  ylab = "Math score at Year 2"
)
grid()  # add background reference lines
abline(h = mean(dta$math2), lty = 2)  # horizontal dashed line at the mean of math2

regression analysis

regress math2 by math1

# Fit a simple linear regression of math2 on math1 and keep the fitted model
# as dta.lm for the summaries and plots that follow.
dta.lm <- lm(formula = math2 ~ math1, data = dta)

做 math2 對 math1 的迴歸分析(以 math1 預測 math2),結果命名為 dta.lm。

show results

# Print the model summary: residual quantiles, coefficient table, residual
# standard error, R-squared and the overall F test.
summary(dta.lm)
## 
## Call:
## lm(formula = math2 ~ math1, data = dta)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.430  -5.521  -0.369   4.253  20.388 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   12.944      2.607   4.965 1.57e-05 ***
## math1          1.030      0.152   6.780 5.57e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.255 on 37 degrees of freedom
## Multiple R-squared:  0.5541, Adjusted R-squared:  0.542 
## F-statistic: 45.97 on 1 and 37 DF,  p-value: 5.571e-08

show anova table

# Print the analysis-of-variance table for the fitted model (sums of squares
# for math1 and for the residuals, with the F test).
anova(dta.lm)
## Analysis of Variance Table
## 
## Response: math2
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## math1      1 2419.6 2419.59  45.973 5.571e-08 ***
## Residuals 37 1947.3   52.63                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

add regression line

# Redraw the scatter plot and overlay the fitted regression line from dta.lm
# as a dashed line.
plot(
  math2 ~ math1,
  data = dta,
  xlim = c(0, 60),
  ylim = c(0, 60),
  xlab = "Math score at Year 1",
  ylab = "Math score at Year 2"
)
abline(dta.lm, lty = 2)

畫 math1 和 math2 的散佈圖,並加上 dta.lm 這條迴歸線。

add plot title

# Same scatter plot with the dashed regression line, now with a main title.
plot(
  math2 ~ math1,
  data = dta,
  xlim = c(0, 60),
  ylim = c(0, 60),
  xlab = "Math score at Year 1",
  ylab = "Math score at Year 2"
)
abline(dta.lm, lty = 2)
title("Mathematics Attainment with the Regression Line")

用 title("Mathematics Attainment with the Regression Line") 加上圖的標題。

diagnostics

specify maximum plot region

par(pty="m")

# Residual diagnostics for dta.lm: scaled residuals against fitted values,
# followed by a normal Q-Q plot.
# NOTE(review): scale() only centres the residuals and divides by their sample
# standard deviation; it does not adjust for leverage the way rstandard()
# does. Confirm this is the intended meaning of "standardized" here.
par(pty = "m")  # switch to the maximal plotting region
plot(
  scale(resid(dta.lm)) ~ fitted(dta.lm),
  ylim = c(-3.5, 3.5),
  type = "n",  # set up the axes only; observations are drawn as labels below
  xlab = "Fitted values",
  ylab = "Standardized residuals"
)

# Draw each observation as its row name at (fitted value, scaled residual).
text(
  x = fitted(dta.lm),
  y = scale(resid(dta.lm)),
  labels = rownames(dta),
  cex = 0.5
)
grid()  # background reference lines

# Add a horizontal red dashed line at zero.
abline(h = 0, lty = 2, col = "red")

## normality check
qqnorm(scale(resid(dta.lm)))  # normal Q-Q plot of the scaled residuals
qqline(scale(resid(dta.lm)))  # Q-Q reference line (not a regression line)
grid()