knitr::opts_chunk$set(echo = TRUE)
setwd("C:/Users/Admin/Desktop/data set computing")
getwd()
## [1] "C:/Users/Admin/Desktop/data set computing"
hsb2 <- read.csv("hsb2.csv")
names(hsb2)[names(hsb2) == "female"] <- "gender"
names(hsb2)
## [1] "id" "gender" "race" "ses" "schtyp" "prog" "read"
## [8] "write" "math" "science" "socst"
knitr::opts_chunk$set(echo = TRUE)
# 1.Structure and dimensions
# structure
options(repos = c(CRAN = "https://cloud.r-project.org"))
install.packages("Stat2Data")
## Installing package into 'C:/Users/Admin/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'Stat2Data' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Admin\AppData\Local\Temp\RtmpEtGqpQ\downloaded_packages
library(Stat2Data)
## Warning: package 'Stat2Data' was built under R version 4.5.2
names(hsb2)
## [1] "id" "gender" "race" "ses" "schtyp" "prog" "read"
## [8] "write" "math" "science" "socst"
dim(hsb2)
## [1] 200 11
summary(hsb2[, c("math", "science", "read", "write", "socst")])
## math science read write
## Min. :33.00 Min. :26.00 Min. :28.00 Min. :31.00
## 1st Qu.:45.00 1st Qu.:44.00 1st Qu.:44.00 1st Qu.:45.75
## Median :52.00 Median :53.00 Median :50.00 Median :54.00
## Mean :52.65 Mean :51.85 Mean :52.23 Mean :52.77
## 3rd Qu.:59.00 3rd Qu.:58.00 3rd Qu.:60.00 3rd Qu.:60.00
## Max. :75.00 Max. :74.00 Max. :76.00 Max. :67.00
## socst
## Min. :26.00
## 1st Qu.:46.00
## Median :52.00
## Mean :52.41
## 3rd Qu.:61.00
## Max. :71.00
You can also embed plots, for example:
## Installing package into 'C:/Users/Admin/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Admin\AppData\Local\Temp\RtmpEtGqpQ\downloaded_packages
## Warning: package 'ggplot2' was built under R version 4.5.3
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
plot(# read by gender
ggplot(hsb2, aes(gender, read)) +
geom_boxplot())
## Warning: Orientation is not uniquely specified when both the x and y aesthetics are
## continuous. Picking default orientation 'x'.
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
knitr::opts_chunk$set(echo = TRUE)
#2. Math by gender (OK: 2 groups)
t.test(math ~ gender, data = hsb2)
##
## Welch Two Sample t-test
##
## data: math by gender
## t = 0.41097, df = 187.58, p-value = 0.6816
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -2.092206 3.193325
## sample estimates:
## mean in group 0 mean in group 1
## 52.94505 52.39450
# Program proportions (OK: categorical)
table(hsb2$prog)
##
## 1 2 3
## 45 105 50
chisq.test(table(hsb2$prog))
##
## Chi-squared test for given probabilities
##
## data: table(hsb2$prog)
## X-squared = 33.25, df = 2, p-value = 6.024e-08
summary(# Science across SES (3 groups → ANOVA, NOT var.test)
summary(aov(science ~ ses, data = hsb2))
)
## Length Class Mode
## [1,] 5 anova list
knitr::opts_chunk$set(echo = TRUE)
#3.
# split read into high/low
hsb2$high_read <- ifelse(hsb2$read > mean(hsb2$read), 1, 0)
t.test(math ~ high_read, data = hsb2)
##
## Welch Two Sample t-test
##
## data: math by high_read
## t = -9.5731, df = 161.53, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -13.060545 -8.593675
## sample estimates:
## mean in group 0 mean in group 1
## 48.04348 58.87059
knitr::opts_chunk$set(echo = TRUE)
#4.
# t-test: school type
t.test(math ~ schtyp, data = hsb2)
##
## Welch Two Sample t-test
##
## data: math by schtyp
## t = -1.448, df = 45.351, p-value = 0.1545
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
## -5.9908645 0.9789598
## sample estimates:
## mean in group 1 mean in group 2
## 52.24405 54.75000
# ANOVA: SES effect on science
aov(science ~ ses, data = hsb2)
## Call:
## aov(formula = science ~ ses, data = hsb2)
##
## Terms:
## ses Residuals
## Sum of Squares 1560.739 17946.761
## Deg. of Freedom 1 198
##
## Residual standard error: 9.520515
## Estimated effects may be unbalanced
# correlation
cor.test(hsb2$read, hsb2$write)
##
## Pearson's product-moment correlation
##
## data: hsb2$read and hsb2$write
## t = 10.465, df = 198, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4993831 0.6792753
## sample estimates:
## cor
## 0.5967765
knitr::opts_chunk$set(echo = TRUE)
#.5.
model <- lm(math ~ read, data = hsb2)
summary(model)
##
## Call:
## lm(formula = math ~ read, data = hsb2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.1624 -5.1624 -0.4135 4.7775 16.4684
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.03816 2.58945 8.125 4.76e-14 ***
## read 0.60515 0.04865 12.438 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.037 on 198 degrees of freedom
## Multiple R-squared: 0.4386, Adjusted R-squared: 0.4358
## F-statistic: 154.7 on 1 and 198 DF, p-value: < 2.2e-16
knitr::opts_chunk$set(echo = TRUE)
model0 <- lm(math ~ 1, data = hsb2)
anova(model0, model)
## Analysis of Variance Table
##
## Model 1: math ~ 1
## Model 2: math ~ read
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 199 17466
## 2 198 9805 1 7660.8 154.7 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
knitr::opts_chunk$set(echo = TRUE)
#6.
# CI for math mean
t.test(hsb2$math)
##
## One Sample t-test
##
## data: hsb2$math
## t = 79.47, df = 199, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 51.33868 53.95132
## sample estimates:
## mean of x
## 52.645
# mean read & write
mean(hsb2$read)
## [1] 52.23
mean(hsb2$write)
## [1] 52.775
# covariance (joint idea)
cov(hsb2[, c("read","write")])
## read write
## read 105.12271 57.99673
## write 57.99673 89.84359
summary(#7.
# math ~ read relationship
lm(math ~ read, data = hsb2))
##
## Call:
## lm(formula = math ~ read, data = hsb2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.1624 -5.1624 -0.4135 4.7775 16.4684
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.03816 2.58945 8.125 4.76e-14 ***
## read 0.60515 0.04865 12.438 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.037 on 198 degrees of freedom
## Multiple R-squared: 0.4386, Adjusted R-squared: 0.4358
## F-statistic: 154.7 on 1 and 198 DF, p-value: < 2.2e-16
knitr::opts_chunk$set(echo = TRUE)
# gender difference in write
t.test(write ~ gender, data = hsb2)
##
## Welch Two Sample t-test
##
## data: write by gender
## t = -3.6564, df = 169.71, p-value = 0.0003409
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -7.499159 -2.240734
## sample estimates:
## mean in group 0 mean in group 1
## 50.12088 54.99083
summary(aov(science ~ ses, data = hsb2))
## Df Sum Sq Mean Sq F value Pr(>F)
## ses 1 1561 1560.7 17.22 4.94e-05 ***
## Residuals 198 17947 90.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
knitr::opts_chunk$set(echo = TRUE)
#.8
set.seed(123)
# Type I error
B <- 500
p <- replicate(B, {
x <- rnorm(30, 50, 10)
t.test(x, mu=50)$p.value
})
mean(p < 0.05)
## [1] 0.034
# Type II error
p2 <- replicate(B, {
x <- rnorm(30, 55, 10)
t.test(x, mu=50)$p.value
})
mean(p2 >= 0.05)
## [1] 0.286
knitr::opts_chunk$set(echo = TRUE)
#9
options(repos = c(CRAN = "https://cloud.r-project.org"))
install.packages("pwr")
## Installing package into 'C:/Users/Admin/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'pwr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Admin\AppData\Local\Temp\RtmpEtGqpQ\downloaded_packages
library(pwr)
## Warning: package 'pwr' was built under R version 4.5.3
pwr.t.test(d=0.5, n=100, sig.level=0.05, type="two.sample")
##
## Two-sample t test power calculation
##
## n = 100
## d = 0.5
## sig.level = 0.05
## power = 0.9404272
## alternative = two.sided
##
## NOTE: n is number in *each* group
# interpretation
test <- t.test(math ~ gender, data = hsb2)
if(test$p.value < 0.05){
"Reject H0: significant difference"
} else {
"Fail to reject H0"
}
## [1] "Fail to reject H0"