# Q1a: descriptive statistics for the percentage of Gr. 3-5 students at math
# grade level.
# The column is coerced to numeric BEFORE any statistic is computed. In the
# original order sd() ran ahead of the coercion, so its result depended on
# whatever type the column happened to have when it was loaded.
dataschools2$GR3_5_mathlevel <- as.numeric(dataschools2$GR3_5_mathlevel)
sd(dataschools2$GR3_5_mathlevel) # 18.9836
## [1] 18.9836
# The mean and median are computed once and stored; the histogram that follows
# draws them as reference lines.
math3_5_mean <- mean(dataschools2$GR3_5_mathlevel)
math3_5_mean # 37.88838
## [1] 37.88838
min(dataschools2$GR3_5_mathlevel) # 0
## [1] 0
max(dataschools2$GR3_5_mathlevel) # 100
## [1] 100
math3_5_median <- median(dataschools2$GR3_5_mathlevel)
math3_5_median # 35.4
## [1] 35.4
# Q1b: histogram (bin width 5) of the math-level percentage, with dashed
# vertical reference lines at the mean (red) and the median (blue).
# - `data` is passed as an ordinary named argument. The original
#   `(data = dataschools2)` was a parenthesised assignment, which also created
#   a stray global variable called `data` as a side effect.
# - Line thickness is set with `linewidth`; using `size` for lines has been
#   deprecated since ggplot2 3.4.0 and raised a lifecycle warning.
ggplot(data = dataschools2, mapping = aes(x = GR3_5_mathlevel)) +
  geom_histogram(binwidth = 5, fill = "lightblue", color = "black") +
  geom_vline(
    xintercept = math3_5_mean,
    color = "red", linetype = "dashed", linewidth = 1
  ) +
  geom_vline(
    xintercept = math3_5_median,
    color = "blue", linetype = "dashed", linewidth = 1
  ) +
  # Text labels are nudged 0.5 units away from their lines (mean to the left,
  # median to the right).
  annotate(
    "text",
    x = math3_5_mean - 0.5, y = 10,
    label = "mean", color = "red", vjust = -0.5
  ) +
  annotate(
    "text",
    x = math3_5_median + 0.5, y = 10,
    label = "median", color = "blue", vjust = -0.5
  ) +
  labs(
    title = "Histogram of At Level Gr.3-5 Math with Mean and Median Lines",
    x = "At Math Grade Level (Gr. 3-5) Percent",
    y = "Frequency"
  )
## 1c. Given your responses to 1a and 1b, can you state that the variable is normally distributed? What information did you use to conclude that?
# Q1c: compare the empirical distribution of the standardised math-level
# percentages with the standard normal distribution.
dataschools2$GR3_5_mathlevel <- as.numeric(dataschools2$GR3_5_mathlevel)
x <- dataschools2$GR3_5_mathlevel
x_mean <- mean(x)
x_sd <- sd(x)

# z-score of every observation
Zi <- (x - x_mean) / x_sd

# Empirical share of observations at or below each cut-off (-2 to 2, step 0.5)
Zi_table <- seq(from = -2, to = 2, by = 0.5)
mass_belowzi <- vapply(
  Zi_table,
  function(cutoff) mean(Zi <= cutoff),
  numeric(1)
)
answers <- data.frame(Zi_table, mass_belowzi)

# Standard normal CDF evaluated at the same cut-offs, for comparison
Zi_snd <- seq(from = -2, to = 2, by = 0.5)
mass_below_Zisnd <- pnorm(Zi_snd)
table_mass_below_Zisnd <- data.frame(Zi_snd, mass_below_Zisnd)
# Q2a: give column 6 a readable name, then summarise the misconduct rate
# (the plot labels below describe it as misconducts per 100 students).
names(dataschools2)[6] <- "misconduct_rate"
sd(dataschools2$misconduct_rate) # 28.35078
## [1] 28.35078
# Wrapping an assignment in parentheses stores the value AND prints it, so the
# mean and median are computed once and kept for the histogram below.
(misconduct_mean <- mean(dataschools2$misconduct_rate)) # 23.16392
## [1] 23.16392
min(dataschools2$misconduct_rate) # 0
## [1] 0
max(dataschools2$misconduct_rate) # 230.6
## [1] 230.6
(misconduct_median <- median(dataschools2$misconduct_rate)) # 13.4
## [1] 13.4
# Q2b: histogram (bin width 5) of the misconduct rate, with dashed vertical
# reference lines at the mean (red) and the median (blue).
# - `data` is passed as an ordinary named argument. The original
#   `(data = dataschools2)` was a parenthesised assignment, which also created
#   a stray global variable called `data` as a side effect.
# - Line thickness is set with `linewidth`; using `size` for lines has been
#   deprecated since ggplot2 3.4.0.
ggplot(data = dataschools2, mapping = aes(x = misconduct_rate)) +
  geom_histogram(binwidth = 5, fill = "lightblue", color = "black") +
  geom_vline(
    xintercept = misconduct_mean,
    color = "red", linetype = "dashed", linewidth = 1
  ) +
  geom_vline(
    xintercept = misconduct_median,
    color = "blue", linetype = "dashed", linewidth = 1
  ) +
  # Text labels are nudged 0.5 units away from their lines (mean to the left,
  # median to the right).
  annotate(
    "text",
    x = misconduct_mean - 0.5, y = 10,
    label = "mean", color = "red", vjust = -0.5
  ) +
  annotate(
    "text",
    x = misconduct_median + 0.5, y = 10,
    label = "median", color = "blue", vjust = -0.5
  ) +
  labs(
    title = "Histogram of Number of Misconducts per 100 Students with Mean and Median Lines",
    x = "Number of Misconducts per 100 students",
    y = "Frequency"
  )
## 2c. Given your responses to 2a and 2b, can you state that the variable is normally distributed? What information did you use to conclude that?
# Q2c: compare the empirical distribution of the standardised misconduct rate
# with the standard normal distribution.
x2 <- dataschools2$misconduct_rate
x2_mean <- mean(x2)
x2_sd <- sd(x2)

# z-score of every observation
Zi2 <- (x2 - x2_mean) / x2_sd

# Empirical share of observations at or below each cut-off (-2 to 2, step 0.5).
# The cut-offs iterated over are this section's own `Zi2_table`; the original
# looped over `Zi_table` from the math-level section, which only gave the
# right answer because the two sequences happen to be identical.
# vapply() is used so the result is guaranteed to be a numeric vector.
Zi2_table <- seq(-2, 2, by = 0.5)
mass_belowzi2 <- vapply(Zi2_table, function(z) mean(Zi2 <= z), numeric(1))
answers2 <- data.frame(Zi2_table, mass_belowzi2)

# Standard normal CDF evaluated at the same cut-offs, for comparison
Zi_snd <- seq(-2, 2, by = 0.5)
mass_below_Zisnd <- pnorm(Zi_snd)
table_mass_below_Zisnd <- data.frame(Zi_snd, mass_below_Zisnd)
# Q3: one-sample, two-sided t-test of H0: mean student attendance = 95.
colnames(dataschools2)[5] <- "avg_student_attendance"

# Hypothesised mean, named once so the manual calculation and t.test() agree.
attendance_mu0 <- 95

attendance_mean <- mean(dataschools2$avg_student_attendance)
attendance_mean # 94.23777
## [1] 94.23777
attendance_sd <- sd(dataschools2$avg_student_attendance)
attendance_sd # 2.150204
## [1] 2.150204
attendance_n <- length(dataschools2$avg_student_attendance)
attendance_n
## [1] 413

# t statistic computed from the data rather than from re-typed, rounded
# constants (the original repeated the same hard-coded expression twice).
t3 <- (attendance_mean - attendance_mu0) / (attendance_sd / sqrt(attendance_n))
t3 # approx. -7.2041, matching t.test() below

# Two-sided 5% critical value from the central t distribution with n - 1 df.
# `ncp` is deliberately omitted: passing `ncp = 0` makes qt() use its
# non-central algorithm instead of the central one.
qt(0.975, df = attendance_n - 1) # approx. 1.9657

# Two-sided p-value from the t distribution with n - 1 df. The original
# `pnorm(t3)` returned a one-sided normal tail (2.92e-13), which did not match
# the two-sided test reported by t.test().
pval_q3 <- 2 * pt(-abs(t3), df = attendance_n - 1) # approx. 2.802e-12

# Cross-check with the built-in test
t.test(dataschools2$avg_student_attendance, mu = attendance_mu0)
##
## One Sample t-test
##
## data: dataschools2$avg_student_attendance
## t = -7.2041, df = 412, p-value = 2.802e-12
## alternative hypothesis: true mean is not equal to 95
## 95 percent confidence interval:
## 94.02979 94.44576
## sample estimates:
## mean of x
## 94.23777
# Q4 setup: columns 2 and 3 become `health_cert` and `safety_score`, then the
# safety score is summarised separately for each value of `health_cert`.
names(dataschools2)[c(2, 3)] <- c("health_cert", "safety_score")

# Group with health_cert == "Yes"
healthy_mean <- mean(with(dataschools2, safety_score[health_cert == "Yes"])) # 62.09091
sd(with(dataschools2, safety_score[health_cert == "Yes"])) # 17.67741
## [1] 17.67741
summary(with(dataschools2, safety_score[health_cert == "Yes"]))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 44.00 53.50 57.00 62.09 62.00 99.00
# Number of rows in this group
sum(with(dataschools2, health_cert == "Yes")) # 11
## [1] 11

# Group with health_cert == "No"
no_healthy_mean <- mean(with(dataschools2, safety_score[health_cert == "No"])) # 49.301
sd(with(dataschools2, safety_score[health_cert == "No"])) # 20.37985
## [1] 20.37985
summary(with(dataschools2, safety_score[health_cert == "No"]))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 34.0 47.0 49.3 62.0 99.0
# Number of rows in this group
sum(with(dataschools2, health_cert == "No")) # 402
## [1] 402
# Q4: two-sample, two-sided comparison of mean safety score.
# X = schools with health_cert == "Yes", Y = schools with health_cert == "No".
# Every quantity is computed from the data; the original re-typed rounded
# means, standard deviations and group sizes by hand.
safety_yes <- dataschools2$safety_score[dataschools2$health_cert == "Yes"]
safety_no <- dataschools2$safety_score[dataschools2$health_cert == "No"]

# Squared standard error contributed by each group: s^2 / n
se2_yes <- var(safety_yes) / length(safety_yes)
se2_no <- var(safety_no) / length(safety_no)

# t = (mean(X) - mean(Y)) / sqrt(s_x^2 / n_x + s_y^2 / n_y)
t4 <- (mean(safety_yes) - mean(safety_no)) / sqrt(se2_yes + se2_no)
t4 # approx. 2.3572, matching t.test() below

# Welch-Satterthwaite degrees of freedom (approx. 10.74 in the t.test() output).
# The "Yes" group has only 11 schools, so the normal approximation used
# originally (qnorm / pnorm) understated the p-value: 0.018 versus the 0.0385
# reported by t.test().
df4 <- (se2_yes + se2_no)^2 /
  (se2_yes^2 / (length(safety_yes) - 1) + se2_no^2 / (length(safety_no) - 1))

# Two-sided 5% critical value and p-value from the t distribution with df4 df
t95 <- qt(0.975, df = df4)
2 * pt(-abs(t4), df = df4) # approx. 0.03852

# Cross-check with the built-in Welch test
t.test(dataschools2$safety_score[dataschools2$health_cert=="Yes"], dataschools2$safety_score[dataschools2$health_cert=="No"])
##
## Welch Two Sample t-test
##
## data: dataschools2$safety_score[dataschools2$health_cert == "Yes"] and dataschools2$safety_score[dataschools2$health_cert == "No"]
## t = 2.3572, df = 10.74, p-value = 0.03852
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.812042 24.767786
## sample estimates:
## mean of x mean of y
## 62.09091 49.30100
# Q5: scatter plot of misconduct rate (y) against average student attendance (x)
ggplot(
  dataschools2,
  aes(x = avg_student_attendance, y = misconduct_rate)
) +
  geom_point()
### Just from the scatter plot, I’m not sure these variables have a strong relationship, but it does look like the higher the average student attendance, the lower the misconduct rate. I think adding a fitted line would help show this relationship more clearly.
# Same scatter plot with a straight-line (lm) fit of y on x drawn over the points
ggplot(
  dataschools2,
  aes(x = avg_student_attendance, y = misconduct_rate)
) +
  geom_point() +
  geom_smooth(formula = y ~ x, method = "lm")
### The linear fit line makes me think that the variables are more strongly correlated than the scatter plot alone suggested.
# Simple linear regression of misconduct rate on average student attendance.
# Printing the fit shows only the call and the two coefficients.
lm(
  formula = misconduct_rate ~ avg_student_attendance,
  data = dataschools2
)
##
## Call:
## lm(formula = misconduct_rate ~ avg_student_attendance, data = dataschools2)
##
## Coefficients:
## (Intercept) avg_student_attendance
## 696.94 -7.15
# Full summary of the same one-predictor model: residual quantiles, coefficient
# table with standard errors and p-values, R-squared and the overall F-test.
summary(
  lm(
    formula = misconduct_rate ~ avg_student_attendance,
    data = dataschools2
  )
)
##
## Call:
## lm(formula = misconduct_rate ~ avg_student_attendance, data = dataschools2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65.230 -11.489 -4.375 5.052 193.582
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 696.9357 51.5101 13.53 <2e-16 ***
## avg_student_attendance -7.1497 0.5465 -13.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.85 on 411 degrees of freedom
## Multiple R-squared: 0.294, Adjusted R-squared: 0.2923
## F-statistic: 171.2 on 1 and 411 DF, p-value: < 2.2e-16
# Multiple regression: misconduct rate on average student attendance AND
# safety score, summarised in the same way as the one-predictor model.
summary(
  lm(
    formula = misconduct_rate ~ avg_student_attendance + safety_score,
    data = dataschools2
  )
)
##
## Call:
## lm(formula = misconduct_rate ~ avg_student_attendance + safety_score,
## data = dataschools2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.919 -11.445 -3.907 4.975 192.104
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 601.44679 60.26407 9.980 < 2e-16 ***
## avg_student_attendance -6.02736 0.65959 -9.138 < 2e-16 ***
## safety_score -0.20704 0.06953 -2.978 0.00308 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.62 on 410 degrees of freedom
## Multiple R-squared: 0.309, Adjusted R-squared: 0.3056
## F-statistic: 91.66 on 2 and 410 DF, p-value: < 2.2e-16