# Load the technology dataset (CSV in the working directory) with readr.
df_tech <- read_csv(file = "technology.csv")
## Rows: 56 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (5): gender, pretest, posttest, compsci, teacher
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
\(H_0: \mu_{\text{post}} - \mu_{\text{pre}} = 0\)
\(H_a: \mu_{\text{post}} - \mu_{\text{pre}} \neq 0\)
# Paired, two-sided t-test of posttest vs. pretest scores
# (H0: mean difference = 0) at significance level `alpha`.
alpha <- 0.05
t_test <- t.test(df_tech$posttest, df_tech$pretest, paired = TRUE)
mean_pre <- mean(df_tech$pretest)
mean_post <- mean(df_tech$posttest)
t_value <- t_test$statistic
p_value <- t_test$p.value
df_value <- t_test$parameter

# Derive the conclusion from the test result instead of hard-coding it, so the
# report stays correct if the data or alpha change. Because the test is
# two-sided, the sign of the t-statistic (posttest - pretest) gives the
# direction of a significant difference.
if (p_value >= alpha) {
  conclusion <- sprintf(
    "Because the p-value >= %.2f, we fail to reject the null hypothesis. We cannot conclude that the technology curriculum produced a significant change in computer skills among the students.",
    alpha
  )
} else if (t_value > 0) {
  conclusion <- sprintf(
    "Because the p-value < %.2f, we reject the null hypothesis. We can conclude that the technology curriculum produced a significant improvement in computer skills among the students.",
    alpha
  )
} else {
  conclusion <- sprintf(
    "Because the p-value < %.2f, we reject the null hypothesis. We can conclude that the technology curriculum was followed by a significant decline in computer skills among the students.",
    alpha
  )
}

cat(sprintf(
  "Pretest mean = %.2f\nPosttest mean = %.2f\nt-statistic = %.2f\np-value = %.4f\ndf_value = %.0f\n\n%s",
  mean_pre,
  mean_post,
  t_value,
  p_value,
  df_value,
  conclusion
))
## Pretest mean = 64.34
## Posttest mean = 67.12
## t-statistic = 3.09
## p-value = 0.0031
## df_value = 55
##
## Because the p-value < 0.05, we reject the null hypothesis. We can conclude that the technology curriculum produced a significant improvement in computer skills among the students.
# Read the students data with base R and preview its first six rows.
df_students <- read.csv(file = "students.csv")
head(df_students, n = 6L)
## sex gpa tvhrswk read math
## 1 2 4.0 2 64.81 64.13
## 2 1 3.0 2 50.74 43.49
## 3 1 2.3 2 53.44 51.80
## 4 2 2.4 2 57.51 51.85
## 5 2 1.6 2 46.92 44.94
## 6 1 2.5 1 38.80 38.55
Reading Exam
\(H_0: \mu_{\text{group1}} = \mu_{\text{group2}}\)
\(H_a: \mu_{\text{group1}} \neq \mu_{\text{group2}}\)
Math Exam
\(H_0: \mu_{\text{group1}} = \mu_{\text{group2}}\)
\(H_a: \mu_{\text{group1}} \neq \mu_{\text{group2}}\)
# Split the students once by TV hours: group 1 has tvhrswk < 2,
# group 2 has tvhrswk >= 2. Rows where tvhrswk is NA are dropped by filter().
low_tv_students <- filter(df_students, tvhrswk < 2)
high_tv_students <- filter(df_students, tvhrswk >= 2)

# Reading and math score vectors for each group.
group1_read <- low_tv_students[["read"]]
group2_read <- high_tv_students[["read"]]
group1_math <- low_tv_students[["math"]]
group2_math <- high_tv_students[["math"]]

# Significance level for the two-sample tests that follow.
alpha <- 0.05
# Welch two-sample t-test (unequal variances) comparing the reading scores of
# the two TV-hours groups, followed by a short text report.
t_test_result <- t.test(group1_read, group2_read, var.equal = FALSE)

# Choose the conclusion from the p-value and `alpha` instead of hard-coding a
# rejection, so the report cannot contradict the computed statistics.
read_groups <- "the reading scores of students who watched less than two hours of television per weekday and the group of students who watched two or more hours of television per weekday."
if (t_test_result$p.value < alpha) {
  read_conclusion <- sprintf(
    "Since p-value < %.2f, we reject the null hypothesis and conclude that there is significant difference between %s",
    alpha,
    read_groups
  )
} else {
  read_conclusion <- sprintf(
    "Since p-value >= %.2f, we fail to reject the null hypothesis and cannot conclude that there is significant difference between %s",
    alpha,
    read_groups
  )
}

cat(sprintf(
  "Group1_read mean = %.2f\nGroup2_read mean = %.2f\nt-statistic = %.2f\np-value = %.4f\n\n%s",
  mean(group1_read),
  mean(group2_read),
  t_test_result$statistic,
  t_test_result$p.value,
  read_conclusion
))
## Group1_read mean = 54.31
## Group2_read mean = 46.97
## t-statistic = 3.21
## p-value = 0.0020
##
## Since p-value < 0.05, we reject the null hypothesis and conclude that there is significant difference between the reading scores of students who watched less than two hours of television per weekday and the group of students who watched two or more hours of television per weekday.
# Welch two-sample t-test (unequal variances) comparing the math scores of
# the two TV-hours groups, followed by a short text report.
t_test_result <- t.test(group1_math, group2_math, var.equal = FALSE)

# Choose the conclusion from the p-value and `alpha` instead of hard-coding a
# rejection, so the report cannot contradict the computed statistics.
math_groups <- "the math scores of students who watched less than two hours of television per weekday and the group of students who watched two or more hours of television per weekday."
if (t_test_result$p.value < alpha) {
  math_conclusion <- sprintf(
    "Since p-value < %.2f, we reject the null hypothesis and conclude that there is significant difference between %s",
    alpha,
    math_groups
  )
} else {
  math_conclusion <- sprintf(
    "Since p-value >= %.2f, we fail to reject the null hypothesis and cannot conclude that there is significant difference between %s",
    alpha,
    math_groups
  )
}

cat(sprintf(
  "Group1_math mean = %.2f\nGroup2_math mean = %.2f\nt-statistic = %.2f\np-value = %.4f\n\n%s",
  mean(group1_math),
  mean(group2_math),
  t_test_result$statistic,
  t_test_result$p.value,
  math_conclusion
))
## Group1_math mean = 54.91
## Group2_math mean = 46.04
## t-statistic = 3.95
## p-value = 0.0002
##
## Since p-value < 0.05, we reject the null hypothesis and conclude that there is significant difference between the math scores of students who watched less than two hours of television per weekday and the group of students who watched two or more hours of television per weekday.
# Load the sophomores dataset (CSV in the working directory) with readr.
df_sph <- read_csv(file = "sophomores.csv")
## Rows: 521 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): gender
## dbl (6): id, gr8math, catmt, math, read, cgpa
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(ggplot2)
# Scatterplot of math against read with a fitted straight line
# (method = "lm") drawn without its confidence band.
ggplot(data = df_sph, mapping = aes(x = read, y = math)) +
  geom_point(size = 2, colour = "steelblue") +
  geom_smooth(method = "lm", colour = "red", se = FALSE) +
  labs(
    x = "READ Score",
    y = "MATH Score",
    title = "Scatterplot of READ vs MATH"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
The scatterplot shows a positive linear relationship between read and math. As read scores increase, math scores also tend to increase. The points appear moderately close to a straight line, suggesting a moderate correlation. Overall, there are no extreme outliers.
# Correlation coefficient between the read and math columns.
with(df_sph, cor(read, math))
## [1] 0.5223075
The correlation coefficient between math and read is r ≈ 0.52. This indicates a moderately strong positive relationship. This aligns with the description of the scatterplot: students with higher reading scores tend to have higher math scores.
# Correlation of cgpa with math and with read, printed side by side
# so the two strengths can be compared.
cor_math_gpa <- with(df_sph, cor(math, cgpa))
cor_read_gpa <- with(df_sph, cor(read, cgpa))
cat(sprintf(
  fmt = "The correlation coefficient between math and cgpa is %.4f\nThe correlation coefficient between reading score and cgpa is %.4f",
  cor_math_gpa,
  cor_read_gpa
))
## The correlation coefficient between math and cgpa is 0.6195
## The correlation coefficient between reading score and cgpa is 0.5476
The relationship between math and cgpa is stronger than that between read and cgpa, because 0.62 > 0.55. This suggests that math performance is more predictive of cgpa than reading performance.
# Simple linear regression of read on cgpa, then the fitted-model summary
# (coefficients, residual standard error, R-squared).
model <- lm(formula = read ~ cgpa, data = df_sph)
summary(object = model)
##
## Call:
## lm(formula = read ~ cgpa, data = df_sph)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.556 -6.791 0.365 7.326 48.797
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 222.6741 1.8390 121.08 <2e-16 ***
## cgpa 9.8038 0.6576 14.91 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.66 on 519 degrees of freedom
## Multiple R-squared: 0.2998, Adjusted R-squared: 0.2985
## F-statistic: 222.2 on 1 and 519 DF, p-value: < 2.2e-16
Based on the output of the linear regression model, the equation for predicting a read score from a given cgpa is: predicted read score = 222.6741 + 9.8038 × cgpa.
Based on the regression model output, approximately 30% of the variance in read score is explained by the students’ cgpa.
# Predicted read score at cgpa = 3.00 from the fitted line. Coefficients are
# looked up by name so the slope and intercept cannot be swapped by position.
predicted_read <- coef(model)["cgpa"] * 3.00 + coef(model)["(Intercept)"]
unname(predicted_read)
## [1] 252.0855
For a cgpa of 3.00, the predicted reading score for a student is approximately 252.