input data

read in a plain text file with variable names and assign a name to it

# Read the whitespace-delimited text file into a data frame. header = TRUE
# takes the column names from the first line of the file (TRUE is spelled out
# because the shorthand T is an ordinary variable that can be reassigned).
# NOTE(review): the absolute path is specific to one machine -- confirm it
# before running elsewhere.
dta <- read.table(
  "/Users/haolunfu/Documents/資料管理/week3/math_attainment.txt",
  header = TRUE
)

checking data

structure of data

# Compactly display the data frame's dimensions and each column's type
str(dta)
#> 'data.frame':    39 obs. of  3 variables:
#>  $ math2: int  28 56 51 13 39 41 30 13 17 32 ...
#>  $ math1: int  18 22 44 8 20 12 16 5 9 18 ...
#>  $ cc   : num  328 406 387 167 328 ...

first 6 rows

# Print the leading rows of the data frame (head() shows 6 rows by default)
head(dta)
#>   math2 math1     cc
#> 1    28    18 328.20
#> 2    56    22 406.03
#> 3    51    44 386.94
#> 4    13     8 166.91
#> 5    39    20 328.20
#> 6    41    12 328.20

descriptive statistics

variable mean

# Mean of every column, returned as a named numeric vector
colMeans(dta)
#>     math2     math1        cc 
#>  28.76923  15.35897 188.83667

variable sd

# Standard deviation of every column, returned as a named numeric vector.
# vapply() iterates over the data frame's columns directly, so the data frame
# is not first coerced to a matrix (as apply() would do), and the
# numeric(1) template guarantees exactly one numeric value per column.
vapply(dta, sd, numeric(1))
#>     math2     math1        cc 
#> 10.720029  7.744224 84.842513

correlation matrix

# Pairwise correlations between all columns (cor() defaults to Pearson)
cor(dta)
#>           math2     math1        cc
#> math2 1.0000000 0.7443604 0.6570098
#> math1 0.7443604 1.0000000 0.5956771
#> cc    0.6570098 0.5956771 1.0000000

plot data

specify square plot region

# pty = "s" requests a square plotting region; par() settings stay in effect
# for subsequent plots until they are changed again
par(pty="s")

scatter plot of math2 by math1

# Scatter plot of Year 2 scores against Year 1 scores, with both axes drawn
# on the same 0-60 range
with(dta, plot(math1, math2,
               xlim = c(0, 60), ylim = c(0, 60),
               xlab = "Math score at Year 1",
               ylab = "Math score at Year 2"))

# Overlay reference grid lines on the scatter plot
grid()

## regression analysis

# Simple linear regression of math2 on math1; the fitted model object is
# reused by the later summary and diagnostic steps
dta.lm <- lm(math2 ~ math1, data = dta)

# Draw the fitted regression line (dashed) on the existing scatter plot
abline(reg = dta.lm, lty = 2)

# Add the main title to the plot
title(main = "Mathematics Attainment")

regression analysis

regress math2 on math1

# Print the coefficient estimates with their standard errors and t tests,
# plus the overall fit statistics (R-squared, F test) for the fitted model
summary(dta.lm)
#> 
#> Call:
#> lm(formula = math2 ~ math1, data = dta)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -10.430  -5.521  -0.369   4.253  20.388 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)   12.944      2.607   4.965 1.57e-05 ***
#> math1          1.030      0.152   6.780 5.57e-08 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 7.255 on 37 degrees of freedom
#> Multiple R-squared:  0.5541, Adjusted R-squared:  0.542 
#> F-statistic: 45.97 on 1 and 37 DF,  p-value: 5.571e-08

# Print the analysis-of-variance table, which splits the variation in math2
# into the math1 term and the residuals
anova(dta.lm)
#> Analysis of Variance Table
#> 
#> Response: math2
#>           Df Sum Sq Mean Sq F value    Pr(>F)    
#> math1      1 2419.6 2419.59  45.973 5.571e-08 ***
#> Residuals 37 1947.3   52.63                      
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

diagnostics

specify maximum plot region

# Switch back to the maximal plotting region
par(pty = "m")

# Residuals-versus-fitted diagnostic plot. local() keeps the helper variables
# out of the global workspace.
local({
  fit_vals <- fitted(dta.lm)
  # Residuals centred and divided by their own standard deviation.
  # NOTE(review): scale() z-scores the raw residuals; rstandard() would give
  # leverage-adjusted standardized residuals -- confirm which is intended.
  z_resid <- scale(resid(dta.lm))

  # Set up empty axes (type = "n") so observations can be drawn as labels
  plot(fit_vals, z_resid,
       ylim = c(-3.5, 3.5), type = "n",
       xlab = "Fitted values", ylab = "Standardized residuals")

  # Mark each observation with its row name in small text
  text(fit_vals, z_resid, labels = rownames(dta), cex = 0.5)

  # Overlay reference grid lines
  grid()

  # Horizontal red dashed reference line at zero
  abline(h = 0, lty = 2, col = "red")
})

normality check

# Normal Q-Q plot of the scaled residuals with a reference line and grid.
# local() keeps the helper variable out of the global workspace.
local({
  z_resid <- scale(resid(dta.lm))

  # Sample quantiles of the scaled residuals against normal quantiles
  qqnorm(z_resid)

  # Reference line for judging departures from normality
  qqline(z_resid)

  # Overlay reference grid lines
  grid()
})

math_attainment

Hao-Lun Fu

2020-03-16

input data

read in a plain text file with variable names and assign a name to it

checking data

structure of data

first 6 rows

descriptive statistics

variable mean

variable sd

correlation matrix

plot data

specify square plot region

scatter plot of math2 by math1

regression analysis

regress math2 on math1

diagnostics

specify maximum plot region

normality check