R fundamentals: In-class Exercise 1

2020-Spring [Data Management] Instructor: SHEU, Ching-Fan

CHIU, Ming-Tzu

2020-03-29

The math attainment page has a dataset and a script of R code chunks. Generate a markdown file from the script to push the output in HTML for posting to course Moodle site.

first R session using math attainment data set

input data

read in a plain text file with variable names and assign a name to it

dta <- read.table("C:/Users/TheorEco Lab/Desktop/108-2/DataManagement/0323/exercise1/math_attainment.txt", header = T)

checking data

structure of data

str(dta)
#> 'data.frame':    39 obs. of  3 variables:
#>  $ math2: int  28 56 51 13 39 41 30 13 17 32 ...
#>  $ math1: int  18 22 44 8 20 12 16 5 9 18 ...
#>  $ cc   : num  328 406 387 167 328 ...

first 6 rows

head(dta)
#>   math2 math1     cc
#> 1    28    18 328.20
#> 2    56    22 406.03
#> 3    51    44 386.94
#> 4    13     8 166.91
#> 5    39    20 328.20
#> 6    41    12 328.20

descriptive statistics

variable mean

colMeans(dta)
#>     math2     math1        cc 
#>  28.76923  15.35897 188.83667

variable sd

apply(dta, 2, sd)
#>     math2     math1        cc 
#> 10.720029  7.744224 84.842513

correlation matrix

cor(dta)
#>           math2     math1        cc
#> math2 1.0000000 0.7443604 0.6570098
#> math1 0.7443604 1.0000000 0.5956771
#> cc    0.6570098 0.5956771 1.0000000

summary of descriptive statistics

dta.ds <- rbind(length(dta), colMeans(dta), apply(dta, 2, sd))
row.names(dta.ds) <- c("n","M","SD")
dta.ds
#>       math2     math1        cc
#> n   3.00000  3.000000   3.00000
#> M  28.76923 15.358974 188.83667
#> SD 10.72003  7.744224  84.84251

cor(dta)
#>           math2     math1        cc
#> math2 1.0000000 0.7443604 0.6570098
#> math1 0.7443604 1.0000000 0.5956771
#> cc    0.6570098 0.5956771 1.0000000

Column 1: Mathematics attainment, end of Year 2 Column 2: Mathematics attainment, end of Year 1 Column 3: Curriculum coverage

plot data

specify square plot region

par(pty="s")

scatter plot of math2 by math1 and add grid lines

plot(math2 ~ math1, data=dta, xlim=c(0, 60), ylim=c(0, 60),
     xlab="Math score at Year 1", ylab="Math score at Year 2")

grid()

regression analysis

regress math2 by math1

dta.lm <- lm(math2 ~ math1, data=dta)

show results

summary(dta.lm)
#> 
#> Call:
#> lm(formula = math2 ~ math1, data = dta)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -10.430  -5.521  -0.369   4.253  20.388 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)   12.944      2.607   4.965 1.57e-05 ***
#> math1          1.030      0.152   6.780 5.57e-08 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 7.255 on 37 degrees of freedom
#> Multiple R-squared:  0.5541, Adjusted R-squared:  0.542 
#> F-statistic: 45.97 on 1 and 37 DF,  p-value: 5.571e-08

show anova table

anova(dta.lm)
#> Analysis of Variance Table
#> 
#> Response: math2
#>           Df Sum Sq Mean Sq F value    Pr(>F)    
#> math1      1 2419.6 2419.59  45.973 5.571e-08 ***
#> Residuals 37 1947.3   52.63                      
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

add regression line

plot(math2 ~ math1, data=dta, xlim=c(0, 60), ylim=c(0, 60),
     xlab="Math score at Year 1", ylab="Math score at Year 2")
grid()

abline(dta.lm, lty=2)

add plot title

plot(math2 ~ math1, data=dta, xlim=c(0, 60), ylim=c(0, 60),
     xlab="Math score at Year 1", ylab="Math score at Year 2")
grid()
abline(dta.lm, lty=2)

title("Mathematics Attainment")

diagnostics

specify maximum plot region

par(pty="m")
plot(scale(resid(dta.lm)) ~ fitted(dta.lm), 
     ylim=c(-3.5, 3.5), type="n",
     xlab="Fitted values", ylab="Standardized residuals")
text(fitted(dta.lm), scale(resid(dta.lm)), labels=rownames(dta), cex=0.5)
grid()

add a horizontal red dash line

plot(scale(resid(dta.lm)) ~ fitted(dta.lm), 
     ylim=c(-3.5, 3.5), type="n",
     xlab="Fitted values", ylab="Standardized residuals")
text(fitted(dta.lm), scale(resid(dta.lm)), labels=rownames(dta), cex=0.5)
grid()

abline(h=0, lty=2, col="red")

normality check

qqnorm(scale(resid(dta.lm)))
qqline(scale(resid(dta.lm)))
grid()