#
# first R session using math attainment data set
#

## input data

# read in a plain text file with variable names and assign a name to it
dta <- read.table("C:/Users/USER/Desktop/1/math_attainment.txt", header = T)

## checking data

# structure of data
str(dta)
## 'data.frame':    39 obs. of  3 variables:
##  $ math2: int  28 56 51 13 39 41 30 13 17 32 ...
##  $ math1: int  18 22 44 8 20 12 16 5 9 18 ...
##  $ cc   : num  328 406 387 167 328 ...
# first 6 rows
head(dta)
##   math2 math1     cc
## 1    28    18 328.20
## 2    56    22 406.03
## 3    51    44 386.94
## 4    13     8 166.91
## 5    39    20 328.20
## 6    41    12 328.20
## descriptive statistics

# variable mean
colMeans(dta)
##     math2     math1        cc 
##  28.76923  15.35897 188.83667
# variable sd
apply(dta, 2, sd)
##     math2     math1        cc 
## 10.720029  7.744224 84.842513
# correlation matrix
cor(dta)
##           math2     math1        cc
## math2 1.0000000 0.7443604 0.6570098
## math1 0.7443604 1.0000000 0.5956771
## cc    0.6570098 0.5956771 1.0000000
## plot data

# specify square plot region
par(pty="s")

# scatter plot of math2 by math1
plot(math2 ~ math1, data=dta, xlim=c(0, 60), ylim=c(0, 60),
     xlab="Math score at Year 1", ylab="Math score at Year 2")

# add grid lines
grid()

## regression analysis

# regress math2 by math1
dta.lm <- lm(math2 ~ math1, data=dta)

# show results
summary(dta.lm)
## 
## Call:
## lm(formula = math2 ~ math1, data = dta)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.430  -5.521  -0.369   4.253  20.388 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   12.944      2.607   4.965 1.57e-05 ***
## math1          1.030      0.152   6.780 5.57e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.255 on 37 degrees of freedom
## Multiple R-squared:  0.5541, Adjusted R-squared:  0.542 
## F-statistic: 45.97 on 1 and 37 DF,  p-value: 5.571e-08
# show anova table
anova(dta.lm)
## Analysis of Variance Table
## 
## Response: math2
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## math1      1 2419.6 2419.59  45.973 5.571e-08 ***
## Residuals 37 1947.3   52.63                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# add regression line
abline(dta.lm, lty=2)

# add plot title
title("Mathematics Attainment")

## diagnostics

# specify maximum plot region
par(pty="m")

#
plot(scale(resid(dta.lm)) ~ fitted(dta.lm), 
     ylim=c(-3.5, 3.5), type="n",
     xlab="Fitted values", ylab="Standardized residuals") 

#
text(fitted(dta.lm), scale(resid(dta.lm)), labels=rownames(dta), cex=0.5)  

#
grid()

# add a horizontal red dash line
abline(h=0, lty=2, col="red")

## normality check

#
qqnorm(scale(resid(dta.lm)))

qqline(scale(resid(dta.lm)))

grid()

###