knitr::opts_chunk$set(echo = TRUE)
setwd("C:/Users/Admin/Desktop/data set computing")
getwd()
## [1] "C:/Users/Admin/Desktop/data set computing"
hsb2 <- read.csv("hsb2.csv")
# 1.Data overview
#Structure and dimensions
str(hsb2)
## 'data.frame': 200 obs. of 11 variables:
## $ id : int 70 121 86 141 172 113 50 11 84 48 ...
## $ female : int 0 1 0 0 0 0 0 0 0 0 ...
## $ race : int 4 4 4 4 4 4 3 1 4 3 ...
## $ ses : int 1 2 3 3 2 2 2 2 2 2 ...
## $ schtyp : int 1 1 1 1 1 1 1 1 1 1 ...
## $ prog : int 1 3 1 3 2 2 1 2 1 2 ...
## $ read : int 57 68 44 63 47 44 50 34 63 57 ...
## $ write : int 52 59 33 44 52 52 59 46 57 55 ...
## $ math : int 41 53 54 47 57 51 42 45 54 52 ...
## $ science: int 47 63 58 53 53 63 53 39 58 50 ...
## $ socst : int 57 61 31 56 61 61 61 36 51 51 ...
dim(hsb2)
## [1] 200 11
summary(hsb2)
## id female race ses schtyp
## Min. : 1.00 Min. :0.000 Min. :1.00 Min. :1.000 Min. :1.00
## 1st Qu.: 50.75 1st Qu.:0.000 1st Qu.:3.00 1st Qu.:2.000 1st Qu.:1.00
## Median :100.50 Median :1.000 Median :4.00 Median :2.000 Median :1.00
## Mean :100.50 Mean :0.545 Mean :3.43 Mean :2.055 Mean :1.16
## 3rd Qu.:150.25 3rd Qu.:1.000 3rd Qu.:4.00 3rd Qu.:3.000 3rd Qu.:1.00
## Max. :200.00 Max. :1.000 Max. :4.00 Max. :3.000 Max. :2.00
## prog read write math
## Min. :1.000 Min. :28.00 Min. :31.00 Min. :33.00
## 1st Qu.:2.000 1st Qu.:44.00 1st Qu.:45.75 1st Qu.:45.00
## Median :2.000 Median :50.00 Median :54.00 Median :52.00
## Mean :2.025 Mean :52.23 Mean :52.77 Mean :52.65
## 3rd Qu.:2.250 3rd Qu.:60.00 3rd Qu.:60.00 3rd Qu.:59.00
## Max. :3.000 Max. :76.00 Max. :67.00 Max. :75.00
## science socst
## Min. :26.00 Min. :26.00
## 1st Qu.:44.00 1st Qu.:46.00
## Median :53.00 Median :52.00
## Mean :51.85 Mean :52.41
## 3rd Qu.:58.00 3rd Qu.:61.00
## Max. :74.00 Max. :71.00
You can also embed plots, for example:
knitr::opts_chunk$set(echo = TRUE)
# 2.Linear models
#regressin analysis
model1 <-lm( math ~ read + write , data = hsb2)
summary(model1)
##
## Call:
## lm(formula = math ~ read + write, data = hsb2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.8478 -4.6996 0.1016 4.4756 16.0483
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.86507 2.82162 4.559 9.00e-06 ***
## read 0.41695 0.05648 7.382 4.29e-12 ***
## write 0.34112 0.06110 5.583 7.76e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.555 on 197 degrees of freedom
## Multiple R-squared: 0.5153, Adjusted R-squared: 0.5104
## F-statistic: 104.7 on 2 and 197 DF, p-value: < 2.2e-16
knitr::opts_chunk$set(echo = TRUE)
hsb2$prog <- as.factor(hsb2$prog)
anova_model1 <- aov(math ~ prog , data = hsb2)
summary(anova_model1)
## Df Sum Sq Mean Sq F value Pr(>F)
## prog 2 4002 2001.1 29.28 7.36e-12 ***
## Residuals 197 13464 68.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
knitr::opts_chunk$set(echo = TRUE)
#ANCOVA
hsb2$ses <- as.factor(hsb2$ses)
ancova_model <- lm( math ~ prog + ses , data = hsb2)
summary(ancova_model)
##
## Call:
## lm(formula = math ~ prog + ses, data = hsb2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.255 -6.656 -1.079 5.644 28.097
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.781 1.504 31.768 < 2e-16 ***
## prog2 5.865 1.483 3.954 0.000107 ***
## prog3 -3.847 1.687 -2.281 0.023653 *
## ses2 2.968 1.464 2.027 0.043991 *
## ses3 4.608 1.644 2.803 0.005575 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.143 on 195 degrees of freedom
## Multiple R-squared: 0.2597, Adjusted R-squared: 0.2445
## F-statistic: 17.1 on 4 and 195 DF, p-value: 4.893e-12
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
knitr::opts_chunk$set(echo = TRUE)
# 3.Diagnostics
# residual analysis
par(mfrow = c(2,2))
plot(pressure)
plot(model1) # you will have 4 plots
knitr::opts_chunk$set(echo = TRUE)
# transformations for non-normalities
shapiro.test(residuals(model1)) #if it's > 0.05 it's normal
##
## Shapiro-Wilk normality test
##
## data: residuals(model1)
## W = 0.99548, p-value = 0.8174
#transformations
model_log <- lm(log(math) ~read + write , data = hsb2)
summary(model_log)
##
## Call:
## lm(formula = log(math) ~ read + write, data = hsb2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.45088 -0.08728 0.00274 0.09001 0.31941
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.191710 0.053994 59.113 < 2e-16 ***
## read 0.007757 0.001081 7.176 1.42e-11 ***
## write 0.006650 0.001169 5.688 4.60e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1254 on 197 degrees of freedom
## Multiple R-squared: 0.511, Adjusted R-squared: 0.5061
## F-statistic: 103 on 2 and 197 DF, p-value: < 2.2e-16
knitr::opts_chunk$set(echo = TRUE)
model_sqrt <- lm(sqrt(math) ~ read + write , data = hsb2)
summary(model_sqrt) # NOTE: the log and sqrt is to reduce skewness you can use any
##
## Call:
## lm(formula = sqrt(math) ~ read + write, data = hsb2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.52687 -0.32656 0.00255 0.30148 1.12995
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.492517 0.194074 23.148 < 2e-16 ***
## read 0.028358 0.003885 7.299 6.96e-12 ***
## write 0.023753 0.004202 5.652 5.50e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4509 on 197 degrees of freedom
## Multiple R-squared: 0.5146, Adjusted R-squared: 0.5097
## F-statistic: 104.4 on 2 and 197 DF, p-value: < 2.2e-16
knitr::opts_chunk$set(echo = TRUE)
# collinearity
#correlation
cor(hsb2[c("read" , "write" , "math")])
## read write math
## read 1.0000000 0.5967765 0.6622801
## write 0.5967765 1.0000000 0.6174493
## math 0.6622801 0.6174493 1.0000000
install.packages("car", repos = "https://cloud.r-project.org")
## Installing package into 'C:/Users/Admin/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'car' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Admin\AppData\Local\Temp\Rtmpchgso3\downloaded_packages
library(car)
## Warning: package 'car' was built under R version 4.5.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.2
model_log <- lm(math ~ read + write, data = hsb2)
#VIF(variance inflation factor)
vif(model_log)
## read write
## 1.553138 1.553138
knitr::opts_chunk$set(echo = TRUE)
# 4.Non-linear model
#polynomial regression
#quadratic model
model_quad <- lm(math ~ poly(read , 2) + poly(write, 2), data = hsb2)
summary(model_quad)
##
## Call:
## lm(formula = math ~ poly(read, 2) + poly(write, 2), data = hsb2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.2606 -4.5697 -0.5615 4.7927 16.6931
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.6450 0.4574 115.106 < 2e-16 ***
## poly(read, 2)1 56.8236 8.1626 6.961 5.01e-11 ***
## poly(read, 2)2 2.2050 6.5841 0.335 0.7381
## poly(write, 2)1 47.8416 8.1167 5.894 1.63e-08 ***
## poly(write, 2)2 17.2451 6.6407 2.597 0.0101 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.468 on 195 degrees of freedom
## Multiple R-squared: 0.5329, Adjusted R-squared: 0.5233
## F-statistic: 55.62 on 4 and 195 DF, p-value: < 2.2e-16
knitr::opts_chunk$set(echo = TRUE)
model_cubic <- lm(math ~ poly(read, 3) + poly(write, 3), data = hsb2)
summary(model_cubic)
##
## Call:
## lm(formula = math ~ poly(read, 3) + poly(write, 3), data = hsb2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.3843 -4.5223 -0.7039 4.6521 16.4599
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.6450 0.4593 114.624 < 2e-16 ***
## poly(read, 3)1 57.0523 8.2090 6.950 5.48e-11 ***
## poly(read, 3)2 2.4778 6.6338 0.374 0.7092
## poly(read, 3)3 -2.2284 6.5179 -0.342 0.7328
## poly(write, 3)1 47.5904 8.1758 5.821 2.40e-08 ***
## poly(write, 3)2 17.1033 6.6741 2.563 0.0111 *
## poly(write, 3)3 -3.2977 6.5187 -0.506 0.6135
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.495 on 193 degrees of freedom
## Multiple R-squared: 0.5338, Adjusted R-squared: 0.5193
## F-statistic: 36.83 on 6 and 193 DF, p-value: < 2.2e-16
knitr::opts_chunk$set(echo = TRUE)
#model comparison ANOVA
anova(model1, model_quad, model_cubic)
## Analysis of Variance Table
##
## Model 1: math ~ read + write
## Model 2: math ~ poly(read, 2) + poly(write, 2)
## Model 3: math ~ poly(read, 3) + poly(write, 3)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 197 8465.5
## 2 195 8158.0 2 307.513 3.6445 0.02795 *
## 3 193 8142.3 2 15.694 0.1860 0.83042
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
knitr::opts_chunk$set(echo = TRUE)
#5.Variance component
#one-way effect model
install.packages("lme4", repos = "https://cloud.r-project.org")
## Installing package into 'C:/Users/Admin/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'lme4' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'lme4'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Admin\AppData\Local\R\win-library\4.5\00LOCK\lme4\libs\x64\lme4.dll to
## C:\Users\Admin\AppData\Local\R\win-library\4.5\lme4\libs\x64\lme4.dll:
## Permission denied
## Warning: restored 'lme4'
##
## The downloaded binary packages are in
## C:\Users\Admin\AppData\Local\Temp\Rtmpchgso3\downloaded_packages
library(lme4)
## Warning: package 'lme4' was built under R version 4.5.2
## Loading required package: Matrix
## Registered S3 method overwritten by 'lme4':
## method from
## na.action.merMod car
model_oneway <- lmer(math ~ 1 + (1 | prog), data = hsb2)
summary(model_oneway)
## Linear mixed model fit by REML ['lmerMod']
## Formula: math ~ 1 + (1 | prog)
## Data: hsb2
##
## REML criterion at convergence: 1417.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.2499 -0.8043 -0.0785 0.7698 3.4296
##
## Random effects:
## Groups Name Variance Std.Dev.
## prog (Intercept) 26.68 5.165
## Residual 68.34 8.267
## Number of obs: 200, groups: prog, 3
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 51.110 3.047 16.77
knitr::opts_chunk$set(echo = TRUE)
#two way random effect
model_twoway <- lmer(math ~ 1 + (1 |prog) + (1 +female) , data = hsb2)
summary(model_twoway)
## Linear mixed model fit by REML ['lmerMod']
## Formula: math ~ 1 + (1 | prog) + (1 + female)
## Data: hsb2
##
## REML criterion at convergence: 1414.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.2907 -0.8234 -0.1074 0.7625 3.3800
##
## Random effects:
## Groups Name Variance Std.Dev.
## prog (Intercept) 26.72 5.170
## Residual 68.57 8.281
## Number of obs: 200, groups: prog, 3
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 51.4727 3.1156 16.52
## female -0.6699 1.1760 -0.57
##
## Correlation of Fixed Effects:
## (Intr)
## female -0.205
knitr::opts_chunk$set(echo = TRUE)
#variance components
VarCorr(model_oneway)
## Groups Name Std.Dev.
## prog (Intercept) 5.1648
## Residual 8.2667
vc_oneway <- as.data.frame(VarCorr(model_oneway))
print(vc_oneway)
## grp var1 var2 vcov sdcor
## 1 prog (Intercept) <NA> 26.67500 5.164785
## 2 Residual <NA> <NA> 68.33792 8.266675
VarCorr(model_twoway)
## Groups Name Std.Dev.
## prog (Intercept) 5.1696
## Residual 8.2808
vc_model_twoway <- as.data.frame(VarCorr(model_twoway))
print(vc_model_twoway)
## grp var1 var2 vcov sdcor
## 1 prog (Intercept) <NA> 26.72482 5.169605
## 2 Residual <NA> <NA> 68.57171 8.280804
knitr::opts_chunk$set(echo = TRUE)
# 6.Generalized linear models
#logistic regression(high vs low performance)
hsb2$performance <- ifelse(hsb2$math >= 50 , 1 , 0)
hsb2$performance <- as.factor(hsb2$performance)
logistic_model <- glm(performance ~ read + write,
data = hsb2,
family = binomial)
summary(logistic_model)
##
## Call:
## glm(formula = performance ~ read + write, family = binomial,
## data = hsb2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.27957 1.38168 -6.716 1.87e-11 ***
## read 0.09261 0.02411 3.842 0.000122 ***
## write 0.09574 0.02362 4.053 5.05e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 269.20 on 199 degrees of freedom
## Residual deviance: 190.15 on 197 degrees of freedom
## AIC: 196.15
##
## Number of Fisher Scoring iterations: 5
knitr::opts_chunk$set(echo = TRUE)
#Poisson regression
poisson_model <- glm (science ~ math + read +write,
data = hsb2,
family = poisson)
summary(poisson_model)
##
## Call:
## glm(formula = science ~ math + read + write, family = poisson,
## data = hsb2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.103691 0.064252 48.305 < 2e-16 ***
## math 0.005878 0.001497 3.926 8.65e-05 ***
## read 0.005710 0.001345 4.246 2.18e-05 ***
## write 0.004319 0.001426 3.029 0.00245 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 386.85 on 199 degrees of freedom
## Residual deviance: 199.50 on 196 degrees of freedom
## AIC: 1361.5
##
## Number of Fisher Scoring iterations: 4
knitr::opts_chunk$set(echo = TRUE)
#7.Simulation
#controlled random seed
set.seed(123)
#synthetic data
sim_math <- rnorm(n=nrow(hsb2),
mean = mean(hsb2$math),
sd = sd(hsb2$math))
synthetic_data <- data.frame(sim_math)
head(synthetic_data)
## sim_math
## 1 47.39421
## 2 50.48859
## 3 67.24768
## 4 53.30555
## 5 53.85623
## 6 68.71250
summary(hsb2$math)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 33.00 45.00 52.00 52.65 59.00 75.00
summary(sim_math)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 31.01 46.78 52.09 52.56 57.97 83.01
summary(model1)$r.squared
## [1] 0.5153089
knitr::opts_chunk$set(echo = TRUE)
#AIC and BIG and RMSE
AIC(model1)
## [1] 1324.663
BIC(model1)
## [1] 1337.856
pred_lm <- predict(model1, hsb2)
rsme <- sqrt(mean(hsb2$math - pred_lm)^2)
rsme
## [1] 7.24748e-14