knitr::opts_chunk$set(echo = TRUE)
setwd("C:/Users/Admin/Desktop/data set computing")
getwd()
## [1] "C:/Users/Admin/Desktop/data set computing"
hsb2 <- read.csv("hsb2.csv")
# 1.Data overview
#Structure and dimensions
str(hsb2)
## 'data.frame':    200 obs. of  11 variables:
##  $ id     : int  70 121 86 141 172 113 50 11 84 48 ...
##  $ female : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ race   : int  4 4 4 4 4 4 3 1 4 3 ...
##  $ ses    : int  1 2 3 3 2 2 2 2 2 2 ...
##  $ schtyp : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ prog   : int  1 3 1 3 2 2 1 2 1 2 ...
##  $ read   : int  57 68 44 63 47 44 50 34 63 57 ...
##  $ write  : int  52 59 33 44 52 52 59 46 57 55 ...
##  $ math   : int  41 53 54 47 57 51 42 45 54 52 ...
##  $ science: int  47 63 58 53 53 63 53 39 58 50 ...
##  $ socst  : int  57 61 31 56 61 61 61 36 51 51 ...
dim(hsb2)
## [1] 200  11
summary(hsb2)
##        id             female           race           ses            schtyp    
##  Min.   :  1.00   Min.   :0.000   Min.   :1.00   Min.   :1.000   Min.   :1.00  
##  1st Qu.: 50.75   1st Qu.:0.000   1st Qu.:3.00   1st Qu.:2.000   1st Qu.:1.00  
##  Median :100.50   Median :1.000   Median :4.00   Median :2.000   Median :1.00  
##  Mean   :100.50   Mean   :0.545   Mean   :3.43   Mean   :2.055   Mean   :1.16  
##  3rd Qu.:150.25   3rd Qu.:1.000   3rd Qu.:4.00   3rd Qu.:3.000   3rd Qu.:1.00  
##  Max.   :200.00   Max.   :1.000   Max.   :4.00   Max.   :3.000   Max.   :2.00  
##       prog            read           write            math      
##  Min.   :1.000   Min.   :28.00   Min.   :31.00   Min.   :33.00  
##  1st Qu.:2.000   1st Qu.:44.00   1st Qu.:45.75   1st Qu.:45.00  
##  Median :2.000   Median :50.00   Median :54.00   Median :52.00  
##  Mean   :2.025   Mean   :52.23   Mean   :52.77   Mean   :52.65  
##  3rd Qu.:2.250   3rd Qu.:60.00   3rd Qu.:60.00   3rd Qu.:59.00  
##  Max.   :3.000   Max.   :76.00   Max.   :67.00   Max.   :75.00  
##     science          socst      
##  Min.   :26.00   Min.   :26.00  
##  1st Qu.:44.00   1st Qu.:46.00  
##  Median :53.00   Median :52.00  
##  Mean   :51.85   Mean   :52.41  
##  3rd Qu.:58.00   3rd Qu.:61.00  
##  Max.   :74.00   Max.   :71.00

Including Plots

You can also embed plots, for example:

knitr::opts_chunk$set(echo = TRUE)
# 2.Linear models
#regressin analysis
model1 <-lm( math ~ read + write , data = hsb2)
summary(model1)
## 
## Call:
## lm(formula = math ~ read + write, data = hsb2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.8478  -4.6996   0.1016   4.4756  16.0483 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.86507    2.82162   4.559 9.00e-06 ***
## read         0.41695    0.05648   7.382 4.29e-12 ***
## write        0.34112    0.06110   5.583 7.76e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.555 on 197 degrees of freedom
## Multiple R-squared:  0.5153, Adjusted R-squared:  0.5104 
## F-statistic: 104.7 on 2 and 197 DF,  p-value: < 2.2e-16
knitr::opts_chunk$set(echo = TRUE)
hsb2$prog <- as.factor(hsb2$prog)
anova_model1 <- aov(math ~ prog , data = hsb2)
summary(anova_model1)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## prog          2   4002  2001.1   29.28 7.36e-12 ***
## Residuals   197  13464    68.3                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
knitr::opts_chunk$set(echo = TRUE)
#ANCOVA
hsb2$ses <- as.factor(hsb2$ses)
ancova_model <- lm( math  ~ prog + ses , data = hsb2)
summary(ancova_model)
## 
## Call:
## lm(formula = math ~ prog + ses, data = hsb2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.255  -6.656  -1.079   5.644  28.097 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   47.781      1.504  31.768  < 2e-16 ***
## prog2          5.865      1.483   3.954 0.000107 ***
## prog3         -3.847      1.687  -2.281 0.023653 *  
## ses2           2.968      1.464   2.027 0.043991 *  
## ses3           4.608      1.644   2.803 0.005575 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.143 on 195 degrees of freedom
## Multiple R-squared:  0.2597, Adjusted R-squared:  0.2445 
## F-statistic:  17.1 on 4 and 195 DF,  p-value: 4.893e-12

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

knitr::opts_chunk$set(echo = TRUE)
# 3.Diagnostics

# residual analysis
par(mfrow = c(2,2))
plot(pressure)

plot(model1)   # you will have 4 plots 

knitr::opts_chunk$set(echo = TRUE)
# transformations for non-normalities

shapiro.test(residuals(model1)) #if it's > 0.05 it's normal
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model1)
## W = 0.99548, p-value = 0.8174
#transformations
model_log <- lm(log(math) ~read + write , data = hsb2) 
summary(model_log)
## 
## Call:
## lm(formula = log(math) ~ read + write, data = hsb2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.45088 -0.08728  0.00274  0.09001  0.31941 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.191710   0.053994  59.113  < 2e-16 ***
## read        0.007757   0.001081   7.176 1.42e-11 ***
## write       0.006650   0.001169   5.688 4.60e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1254 on 197 degrees of freedom
## Multiple R-squared:  0.511,  Adjusted R-squared:  0.5061 
## F-statistic:   103 on 2 and 197 DF,  p-value: < 2.2e-16
knitr::opts_chunk$set(echo = TRUE)
model_sqrt <- lm(sqrt(math) ~ read + write , data = hsb2)
summary(model_sqrt)  # NOTE: the log and sqrt is to reduce skewness you can use any
## 
## Call:
## lm(formula = sqrt(math) ~ read + write, data = hsb2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.52687 -0.32656  0.00255  0.30148  1.12995 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.492517   0.194074  23.148  < 2e-16 ***
## read        0.028358   0.003885   7.299 6.96e-12 ***
## write       0.023753   0.004202   5.652 5.50e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4509 on 197 degrees of freedom
## Multiple R-squared:  0.5146, Adjusted R-squared:  0.5097 
## F-statistic: 104.4 on 2 and 197 DF,  p-value: < 2.2e-16
knitr::opts_chunk$set(echo = TRUE)
# collinearity
#correlation
cor(hsb2[c("read" , "write" , "math")])
##            read     write      math
## read  1.0000000 0.5967765 0.6622801
## write 0.5967765 1.0000000 0.6174493
## math  0.6622801 0.6174493 1.0000000
install.packages("car", repos = "https://cloud.r-project.org")
## Installing package into 'C:/Users/Admin/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'car' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Admin\AppData\Local\Temp\Rtmpchgso3\downloaded_packages
library(car)
## Warning: package 'car' was built under R version 4.5.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.2
model_log <- lm(math ~ read + write, data = hsb2)
#VIF(variance inflation factor)
vif(model_log)
##     read    write 
## 1.553138 1.553138
knitr::opts_chunk$set(echo = TRUE)
# 4.Non-linear model

#polynomial regression
#quadratic model
model_quad <- lm(math ~ poly(read , 2) + poly(write, 2), data = hsb2)
summary(model_quad)
## 
## Call:
## lm(formula = math ~ poly(read, 2) + poly(write, 2), data = hsb2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.2606  -4.5697  -0.5615   4.7927  16.6931 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      52.6450     0.4574 115.106  < 2e-16 ***
## poly(read, 2)1   56.8236     8.1626   6.961 5.01e-11 ***
## poly(read, 2)2    2.2050     6.5841   0.335   0.7381    
## poly(write, 2)1  47.8416     8.1167   5.894 1.63e-08 ***
## poly(write, 2)2  17.2451     6.6407   2.597   0.0101 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.468 on 195 degrees of freedom
## Multiple R-squared:  0.5329, Adjusted R-squared:  0.5233 
## F-statistic: 55.62 on 4 and 195 DF,  p-value: < 2.2e-16
knitr::opts_chunk$set(echo = TRUE)
model_cubic <- lm(math ~ poly(read, 3) + poly(write, 3), data = hsb2)
summary(model_cubic)
## 
## Call:
## lm(formula = math ~ poly(read, 3) + poly(write, 3), data = hsb2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.3843  -4.5223  -0.7039   4.6521  16.4599 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      52.6450     0.4593 114.624  < 2e-16 ***
## poly(read, 3)1   57.0523     8.2090   6.950 5.48e-11 ***
## poly(read, 3)2    2.4778     6.6338   0.374   0.7092    
## poly(read, 3)3   -2.2284     6.5179  -0.342   0.7328    
## poly(write, 3)1  47.5904     8.1758   5.821 2.40e-08 ***
## poly(write, 3)2  17.1033     6.6741   2.563   0.0111 *  
## poly(write, 3)3  -3.2977     6.5187  -0.506   0.6135    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.495 on 193 degrees of freedom
## Multiple R-squared:  0.5338, Adjusted R-squared:  0.5193 
## F-statistic: 36.83 on 6 and 193 DF,  p-value: < 2.2e-16
knitr::opts_chunk$set(echo = TRUE)
#model comparison ANOVA
anova(model1, model_quad, model_cubic)
## Analysis of Variance Table
## 
## Model 1: math ~ read + write
## Model 2: math ~ poly(read, 2) + poly(write, 2)
## Model 3: math ~ poly(read, 3) + poly(write, 3)
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1    197 8465.5                              
## 2    195 8158.0  2   307.513 3.6445 0.02795 *
## 3    193 8142.3  2    15.694 0.1860 0.83042  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
knitr::opts_chunk$set(echo = TRUE)
#5.Variance component
#one-way effect model
install.packages("lme4", repos = "https://cloud.r-project.org")
## Installing package into 'C:/Users/Admin/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'lme4' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'lme4'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Admin\AppData\Local\R\win-library\4.5\00LOCK\lme4\libs\x64\lme4.dll to
## C:\Users\Admin\AppData\Local\R\win-library\4.5\lme4\libs\x64\lme4.dll:
## Permission denied
## Warning: restored 'lme4'
## 
## The downloaded binary packages are in
##  C:\Users\Admin\AppData\Local\Temp\Rtmpchgso3\downloaded_packages
library(lme4)
## Warning: package 'lme4' was built under R version 4.5.2
## Loading required package: Matrix
## Registered S3 method overwritten by 'lme4':
##   method           from
##   na.action.merMod car
model_oneway <- lmer(math ~ 1 + (1 | prog), data = hsb2)
summary(model_oneway)
## Linear mixed model fit by REML ['lmerMod']
## Formula: math ~ 1 + (1 | prog)
##    Data: hsb2
## 
## REML criterion at convergence: 1417.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.2499 -0.8043 -0.0785  0.7698  3.4296 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  prog     (Intercept) 26.68    5.165   
##  Residual             68.34    8.267   
## Number of obs: 200, groups:  prog, 3
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   51.110      3.047   16.77
knitr::opts_chunk$set(echo = TRUE)
#two way random effect
model_twoway <- lmer(math ~ 1 + (1 |prog) + (1 +female) , data = hsb2)
summary(model_twoway)
## Linear mixed model fit by REML ['lmerMod']
## Formula: math ~ 1 + (1 | prog) + (1 + female)
##    Data: hsb2
## 
## REML criterion at convergence: 1414.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.2907 -0.8234 -0.1074  0.7625  3.3800 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  prog     (Intercept) 26.72    5.170   
##  Residual             68.57    8.281   
## Number of obs: 200, groups:  prog, 3
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  51.4727     3.1156   16.52
## female       -0.6699     1.1760   -0.57
## 
## Correlation of Fixed Effects:
##        (Intr)
## female -0.205
knitr::opts_chunk$set(echo = TRUE)
#variance components
VarCorr(model_oneway)
##  Groups   Name        Std.Dev.
##  prog     (Intercept) 5.1648  
##  Residual             8.2667
vc_oneway <- as.data.frame(VarCorr(model_oneway))
print(vc_oneway)
##        grp        var1 var2     vcov    sdcor
## 1     prog (Intercept) <NA> 26.67500 5.164785
## 2 Residual        <NA> <NA> 68.33792 8.266675
VarCorr(model_twoway)
##  Groups   Name        Std.Dev.
##  prog     (Intercept) 5.1696  
##  Residual             8.2808
vc_model_twoway <- as.data.frame(VarCorr(model_twoway))
print(vc_model_twoway)
##        grp        var1 var2     vcov    sdcor
## 1     prog (Intercept) <NA> 26.72482 5.169605
## 2 Residual        <NA> <NA> 68.57171 8.280804
knitr::opts_chunk$set(echo = TRUE)
# 6.Generalized linear models
#logistic regression(high vs low performance)
hsb2$performance <- ifelse(hsb2$math >= 50 , 1 , 0)
hsb2$performance <- as.factor(hsb2$performance)
logistic_model <- glm(performance ~ read + write,
                      data = hsb2, 
                      family = binomial)
summary(logistic_model)
## 
## Call:
## glm(formula = performance ~ read + write, family = binomial, 
##     data = hsb2)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.27957    1.38168  -6.716 1.87e-11 ***
## read         0.09261    0.02411   3.842 0.000122 ***
## write        0.09574    0.02362   4.053 5.05e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 269.20  on 199  degrees of freedom
## Residual deviance: 190.15  on 197  degrees of freedom
## AIC: 196.15
## 
## Number of Fisher Scoring iterations: 5
knitr::opts_chunk$set(echo = TRUE)
#Poisson regression 
poisson_model <- glm (science ~ math + read +write,
                      data = hsb2,
                      family = poisson)
summary(poisson_model)
## 
## Call:
## glm(formula = science ~ math + read + write, family = poisson, 
##     data = hsb2)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 3.103691   0.064252  48.305  < 2e-16 ***
## math        0.005878   0.001497   3.926 8.65e-05 ***
## read        0.005710   0.001345   4.246 2.18e-05 ***
## write       0.004319   0.001426   3.029  0.00245 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 386.85  on 199  degrees of freedom
## Residual deviance: 199.50  on 196  degrees of freedom
## AIC: 1361.5
## 
## Number of Fisher Scoring iterations: 4
knitr::opts_chunk$set(echo = TRUE)
#7.Simulation 
#controlled random seed
set.seed(123)

#synthetic data
sim_math <- rnorm(n=nrow(hsb2),
                  mean = mean(hsb2$math),
                  sd = sd(hsb2$math))
synthetic_data <- data.frame(sim_math)
head(synthetic_data)
##   sim_math
## 1 47.39421
## 2 50.48859
## 3 67.24768
## 4 53.30555
## 5 53.85623
## 6 68.71250
summary(hsb2$math)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   33.00   45.00   52.00   52.65   59.00   75.00
summary(sim_math)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   31.01   46.78   52.09   52.56   57.97   83.01
summary(model1)$r.squared
## [1] 0.5153089
knitr::opts_chunk$set(echo = TRUE)
#AIC and BIG and RMSE
AIC(model1)
## [1] 1324.663
BIC(model1)
## [1] 1337.856
pred_lm <- predict(model1, hsb2)
rsme <- sqrt(mean(hsb2$math - pred_lm)^2)
rsme
## [1] 7.24748e-14