#Zhivko Kolevski T7
#ANOVA LAB
#Motivating Example
# Load the ACT sample scores (one column per language)
scores <- read.csv(file = "ACT Sample Scores.csv")
# Inspect the dimensions and column types
str(object = scores)
## 'data.frame': 8 obs. of 4 variables:
## $ English: int 28 26 24 29 30 24 25 26
## $ French : int 33 29 31 31 30 29 28 32
## $ German : int 33 31 29 30 32 29 28 34
## $ Spanish: int 31 29 26 30 24 29 34 33
# Quartiles and mean of each language column
summary(object = scores)
## English French German Spanish
## Min. :24.00 Min. :28.00 Min. :28.00 Min. :24.00
## 1st Qu.:24.75 1st Qu.:29.00 1st Qu.:29.00 1st Qu.:28.25
## Median :26.00 Median :30.50 Median :30.50 Median :29.50
## Mean :26.50 Mean :30.38 Mean :30.75 Mean :29.50
## 3rd Qu.:28.25 3rd Qu.:31.25 3rd Qu.:32.25 3rd Qu.:31.50
## Max. :30.00 Max. :33.00 Max. :34.00 Max. :34.00
# Print the full table
scores
## English French German Spanish
## 1 28 33 33 31
## 2 26 29 31 29
## 3 24 31 29 26
## 4 29 31 30 30
## 5 30 30 32 24
## 6 24 29 29 29
## 7 25 28 28 34
## 8 26 32 34 33
# Reshape from wide (one column per language) to long (value + group label)
scores.stacked <- stack(scores)
# Inspect the structure of the long-format data
str(object = scores.stacked)
## 'data.frame': 32 obs. of 2 variables:
## $ values: int 28 26 24 29 30 24 25 26 33 29 ...
## $ ind : Factor w/ 4 levels "English","French",..: 1 1 1 1 1 1 1 1 2 2 ...
# Summarise the long-format data
summary(object = scores.stacked)
## values ind
## Min. :24.00 English:8
## 1st Qu.:28.00 French :8
## Median :29.00 German :8
## Mean :29.28 Spanish:8
## 3rd Qu.:31.00
## Max. :34.00
# Show the first 10 rows
head(x = scores.stacked, n = 10)
## values ind
## 1 28 English
## 2 26 English
## 3 24 English
## 4 29 English
## 5 30 English
## 6 24 English
## 7 25 English
## 8 26 English
## 9 33 French
## 10 29 French
# Change the default column names ("values", "ind") to descriptive ones
names(scores.stacked) <- c("act_score", "language")
# Show the first 10 rows again to confirm the rename
head(x = scores.stacked, n = 10)
## act_score language
## 1 28 English
## 2 26 English
## 3 24 English
## 4 29 English
## 5 30 English
## 6 24 English
## 7 25 English
## 8 26 English
## 9 33 French
## 10 29 French
# Visualize the data
# Side-by-side boxplots of ACT score, one box per language
boxplot(act_score ~ language,
        data = scores.stacked,
        col = "light blue",
        ylim = c(24, 36), # extra headroom so the legend has room
        xlab = "Language",
        ylab = "ACT Score",
        main = "Test Scores by Language")
# Mean ACT score within each language
means <- aggregate(act_score ~ language, data = scores.stacked, FUN = mean)
# Overlay the within-language means as diamonds on the boxes
points(means, pch = 18, col = "black")
# Dashed horizontal reference line at the overall mean
abline(h = mean(scores.stacked$act_score),
       lty = 2, # dashed line
       lwd = 2, # thicker than the default
       col = "red")
# Legend: diamond marker for group means, dashed line for the overall mean
legend("topleft",
       legend = c("Within-language mean", "Overall mean"),
       col = c("black", "red"),
       pch = c(18, NA),
       lty = c(NA, 2))

#PHASE 1
#prepare
#H0: μE=μF=μG=μS
#Ha: not H0
#check
#1.Are the observations independent within and across groups?
#Based on the description of the data, we do not have reason to believe that information about one observation would give us additional information about an observation within the same group or in a different group. We may assume independence.
#2. Are the data within each group nearly normal? To check this condition, we should view histograms and Normal Q-Q plots. This condition only needs to hold loosely. So long as the data appear unimodal and mound shaped, we won’t have any serious reasons for concern.
# One data frame per language; these are reused by the Q-Q plots further down
scores.e <- subset(scores.stacked, language == "English")
scores.f <- subset(scores.stacked, language == "French")
scores.g <- subset(scores.stacked, language == "German")
scores.s <- subset(scores.stacked, language == "Spanish")
# 2x2 grid of panels; the outer top margin leaves room for a shared title
par(oma = c(0, 0, 2, 0), mfrow = c(2, 2))
# One histogram per language, identified by its x-axis label
hist(scores.e$act_score, col = "light blue", xlab = "English scores", main = "")
hist(scores.f$act_score, col = "light blue", xlab = "French scores", main = "")
hist(scores.g$act_score, col = "light blue", xlab = "German scores", main = "")
hist(scores.s$act_score, col = "light blue", xlab = "Spanish scores", main = "")
# Shared title drawn in the outer margin
title(main = "Histograms of ACT scores by language", cex = 1.5, outer = TRUE)

# 2x2 grid of panels; the outer top margin leaves room for a shared title
par(oma = c(0, 0, 2, 0), mfrow = c(2, 2))
# Normal Q-Q plot plus reference line for each language
# English
qqnorm(scores.e$act_score, ylab = "English scores", main = "")
qqline(scores.e$act_score, col = "red")
# French
qqnorm(scores.f$act_score, ylab = "French scores", main = "")
qqline(scores.f$act_score, col = "red")
# German
qqnorm(scores.g$act_score, ylab = "German scores", main = "")
qqline(scores.g$act_score, col = "red")
# Spanish
qqnorm(scores.s$act_score, ylab = "Spanish scores", main = "")
qqline(scores.s$act_score, col = "red")
# Shared title drawn in the outer margin
title(main = "Normal QQ Plots of ACT scores by language", cex = 1.5, outer = TRUE)

#3. Is the variability across groups about equal? There are formal methods to compare variances, but we will not use them in this course. Boxplots are helpful to informally check the equal variance assumption. In particular, if the IQRs are similar in width, then we can consider this assumption met. In our example, the IQRs are similar in width, so we are safe to proceed under the assumption of equal variances.
#Calculate: As with all other hypothesis tests, the goal of the Calculate step is to compute the p-value that corresponds to our stated hypotheses. Fortunately for us, R computes the F test statistic, standard error, and p-value for us when we use the aov() command.
# Fit the one-way ANOVA of ACT score on language
model.act <- aov(formula = act_score ~ language, data = scores.stacked)
# Print the ANOVA table (F statistic and p-value)
summary(model.act)
## Df Sum Sq Mean Sq F value Pr(>F)
## language 3 89.09 29.698 5.028 0.00651 **
## Residuals 28 165.37 5.906
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Conclude
#Since p-value=0.00651<0.05=α, we have statistically significant evidence that at least one pair of mean ACT scores is different. That is, we have sufficient evidence to reject H0: μE=μF=μG=μS.
#PHASE 2
# Pairwise pooled-SD t-tests between languages, p-values left unadjusted
pairwise.t.test(scores.stacked$act_score,
                scores.stacked$language,
                p.adjust.method = "none")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: scores.stacked$act_score and scores.stacked$language
##
## English French German
## French 0.0035 - -
## German 0.0016 0.7599 -
## Spanish 0.0199 0.4774 0.3124
##
## P value adjustment method: none
# Six unadjusted comparisons may inflate the Type 1 error rate,
# so repeat the comparisons with Bonferroni-adjusted p-values
pairwise.t.test(scores.stacked$act_score,
                scores.stacked$language,
                p.adjust.method = "bonferroni")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: scores.stacked$act_score and scores.stacked$language
##
## English French German
## French 0.0210 - -
## German 0.0095 1.0000 -
## Spanish 0.1195 1.0000 1.0000
##
## P value adjustment method: bonferroni
#Comparing these adjusted p-values to αoverall=0.05 will lead to the same conclusions as comparing the non-adjusted p-values to αsingle=0.05/6≈0.0083. We conclude from the data that there are statistically significant differences between English and French average ACT scores and English and German average ACT scores.
# ON YOUR OWN
# Load the state-level education data.
# stringsAsFactors = TRUE keeps the text columns (PRIMARY_KEY, STATE, REGION)
# as factors. The level counts printed by the summary() calls further down
# depend on that; since R 4.0.0 read.csv() would otherwise read them as
# character and those summaries would only show Length/Class/Mode.
scores <- read.csv("states_edu.csv", stringsAsFactors = TRUE)
# Inspect the dimensions and column types
str(scores)
## 'data.frame': 1280 obs. of 26 variables:
## $ PRIMARY_KEY : Factor w/ 1275 levels "1992_ALABAMA",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ STATE : Factor w/ 51 levels "ALABAMA","ALASKA",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ YEAR : int 1992 1992 1992 1992 1992 1992 1992 1992 1992 1992 ...
## $ REGION : Factor w/ 5 levels "MIDWEST","NORTHEAST",..: 3 5 4 3 5 5 2 3 3 3 ...
## $ ENROLL : int NA NA NA NA NA NA NA NA NA NA ...
## $ TOTAL_REVENUE : int 2678885 1049591 3258079 1711959 26260025 3185173 3834302 645233 709480 11506299 ...
## $ FEDERAL_REVENUE : int 304177 106780 297888 178571 2072470 163253 143542 45945 64749 788420 ...
## $ STATE_REVENUE : int 1659028 720711 1369815 958785 16546514 1307986 1342539 420942 0 5683949 ...
## $ LOCAL_REVENUE : int 715680 222100 1590376 574603 7641041 1713934 2348221 178346 644731 5033930 ...
## $ TOTAL_EXPENDITURE : int 2653798 972488 3401580 1743022 27138832 3264826 3721338 638784 742893 11305642 ...
## $ INSTRUCTION_EXPENDITURE : int 1481703 498362 1435908 964323 14358922 1642466 2148041 372722 329160 5166374 ...
## $ SUPPORT_SERVICES_EXPENDITURE: int 735036 350902 1007732 483488 8520926 1035970 1142600 194915 316679 3410440 ...
## $ OTHER_EXPENDITURE : int NA NA NA NA NA NA NA NA NA NA ...
## $ CAPITAL_OUTLAY_EXPENDITURE : int 174053 37451 609114 145212 2044688 364760 48542 30595 47272 1667826 ...
## $ GRADES_PK_G : int 8224 2371 2544 808 59067 7410 5731 463 4818 31464 ...
## $ GRADES_KG_G : int 55460 10152 53497 33511 431763 47588 41319 8025 6667 161701 ...
## $ GRADES_4_G : int 57948 9748 55433 34632 418418 50648 38058 8272 5832 164416 ...
## $ GRADES_8_G : int 58025 8789 49081 36011 363296 45025 33691 8012 5000 142372 ...
## $ GRADES_12_G : int 41167 6714 37410 27651 270675 34533 28366 6129 3433 100835 ...
## $ GRADES_1_8_G : int 471564 79117 437127 281338 3286034 394904 304284 67495 47009 1276685 ...
## $ GRADES_9_12_G : int 196386 30847 175210 123113 1372011 160299 126917 28338 18173 511557 ...
## $ GRADES_ALL_G : int 676174 112335 614881 405259 4717112 562613 436932 96296 70000 1819706 ...
## $ AVG_MATH_4_SCORE : num 208 NA 215 210 208 ...
## $ AVG_MATH_8_SCORE : num 252 NA 265 256 261 ...
## $ AVG_READING_4_SCORE : num 208 NA 206 209 197 ...
## $ AVG_READING_8_SCORE : num NA 259 262 265 NA ...
# Column-by-column summary: quartiles for numeric columns, level counts for factors
summary(object = scores)
## PRIMARY_KEY STATE YEAR
## 2008_VIRGINIA : 3 DISTRICT_OF_COLUMBIA: 28 Min. :1992
## 2008_DISTRICT_OF_COLUMBIA: 2 VIRGINIA : 27 1st Qu.:1998
## 2009_DISTRICT_OF_COLUMBIA: 2 ALABAMA : 25 Median :2004
## 2010_DISTRICT_OF_COLUMBIA: 2 ALASKA : 25 Mean :2004
## 1992_ALABAMA : 1 ARIZONA : 25 3rd Qu.:2010
## 1992_ALASKA : 1 ARKANSAS : 25 Max. :2016
## (Other) :1269 (Other) :1125
## REGION ENROLL TOTAL_REVENUE FEDERAL_REVENUE
## MIDWEST :300 Min. : 43866 Min. : 465650 Min. : 31020
## NORTHEAST:225 1st Qu.: 258314 1st Qu.: 2186305 1st Qu.: 189354
## SOUTHEAST:380 Median : 648313 Median : 5079546 Median : 403376
## SOUTHWEST:100 Mean : 915931 Mean : 9092082 Mean : 766372
## WEST :275 3rd Qu.:1014528 3rd Qu.:10859848 3rd Qu.: 828966
## Max. :6307022 Max. :89217262 Max. :9990221
## NA's :51
## STATE_REVENUE LOCAL_REVENUE TOTAL_EXPENDITURE
## Min. : 0 Min. : 22093 Min. : 481665
## 1st Qu.: 1153097 1st Qu.: 715834 1st Qu.: 2165404
## Median : 2537074 Median : 2055780 Median : 5234506
## Mean : 4216553 Mean : 4109157 Mean : 9196681
## 3rd Qu.: 5080939 3rd Qu.: 4768680 3rd Qu.:10745191
## Max. :50904567 Max. :36105265 Max. :85320133
##
## INSTRUCTION_EXPENDITURE SUPPORT_SERVICES_EXPENDITURE OTHER_EXPENDITURE
## Min. : 265549 Min. : 139963 Min. : 11541
## 1st Qu.: 1168032 1st Qu.: 635790 1st Qu.: 102831
## Median : 2657452 Median : 1525406 Median : 271596
## Mean : 4762966 Mean : 2680331 Mean : 429205
## 3rd Qu.: 5568028 3rd Qu.: 3229651 3rd Qu.: 518600
## Max. :43964520 Max. :26058021 Max. :3995951
## NA's :51
## CAPITAL_OUTLAY_EXPENDITURE GRADES_PK_G GRADES_KG_G GRADES_4_G
## Min. : 12708 Min. : 0 Min. : 3459 Min. : 2548
## 1st Qu.: 181564 1st Qu.: 2600 1st Qu.: 19603 1st Qu.: 19305
## Median : 510260 Median : 9384 Median : 48841 Median : 49288
## Mean : 902769 Mean : 18969 Mean : 69623 Mean : 70423
## 3rd Qu.: 966852 3rd Qu.: 23981 3rd Qu.: 79396 3rd Qu.: 79078
## Max. :10223657 Max. :250911 Max. :530531 Max. :493415
## NA's :62 NA's :51 NA's :51
## GRADES_8_G GRADES_12_G GRADES_1_8_G GRADES_9_12_G
## Min. : 1485 Min. : 484 Min. : 19226 Min. : 2758
## 1st Qu.: 20091 1st Qu.: 17020 1st Qu.: 156033 1st Qu.: 76118
## Median : 49332 Median : 39634 Median : 399494 Median : 181719
## Mean : 70162 Mean : 59375 Mean : 566614 Mean : 270109
## 3rd Qu.: 80022 3rd Qu.: 69858 3rd Qu.: 638447 3rd Qu.: 310607
## Max. :500143 Max. :498403 Max. :3929869 Max. :2013687
## NA's :51 NA's :51 NA's :51 NA's :51
## GRADES_ALL_G AVG_MATH_4_SCORE AVG_MATH_8_SCORE AVG_READING_4_SCORE
## Min. : 24554 Min. :187.1 Min. :232.8 Min. :178.6
## 1st Qu.: 253825 1st Qu.:228.7 1st Qu.:271.9 1st Qu.:214.4
## Median : 608846 Median :236.8 Median :280.2 Median :220.2
## Mean : 857515 Mean :234.3 Mean :278.0 Mean :218.6
## 3rd Qu.: 967278 3rd Qu.:242.0 3rd Qu.:285.1 3rd Qu.:223.9
## Max. :5944746 Max. :253.4 Max. :300.6 Max. :236.8
## NA's :62 NA's :795 NA's :799 NA's :798
## AVG_READING_8_SCORE
## Min. :236.4
## 1st Qu.:259.2
## Median :264.8
## Mean :263.4
## 3rd Qu.:268.1
## Max. :277.2
## NA's :833
#scores (output is too big for the turn in)
#1 Do 8th grade math scores vary across geographic regions of the United States?
#2 If so, which regions?
#Question 1. How many observations does the dataset contain? How many variables?
#Observations 1280
#Variables 26
#Question 2. What does each observation represent?
# It is data about a state in a certain year(1992-2016)
#Question 3. Create a scatterplot of AVG_MATH_8_SCORE vs. AVG_READING_8_SCORE and interpret the plot.
# 8th grade math score (y) against 8th grade reading score (x)
plot(x = scores$AVG_READING_8_SCORE,
     y = scores$AVG_MATH_8_SCORE,
     xlab = "AVG_READING_8_SCORE",
     ylab = "AVG_MATH_8_SCORE")
#Higher math scores are proportional to higher reading scores in most cases
#Question 4.Create a scatterplot of AVG_MATH_8_SCORE vs. AVG_MATH_4_SCORE and interpret the plot.
# The y values must be the 8th grade MATH scores so the data match both the
# question and the y-axis label (the reading scores were plotted here before).
plot(x = scores$AVG_MATH_4_SCORE,
     y = scores$AVG_MATH_8_SCORE,
     xlab = "AVG_MATH_4_SCORE",
     ylab = "AVG_MATH_8_SCORE")
#Similar to the previous question, a higher math score in 4th grade is proportional to a higher math score in 8th grade in most cases.
#Question 5. Create a scatterplot of AVG_MATH_8_SCORE vs. INSTRUCTION_EXPENDITURE and interpret the plot.
# 8th grade math score (y) against instruction expenditure (x)
plot(x = scores$INSTRUCTION_EXPENDITURE,
     y = scores$AVG_MATH_8_SCORE,
     xlab = "INSTRUCTION_EXPENDITURE",
     ylab = "AVG_MATH_8_SCORE")
#In this case the 8th grade math score is not proportional to instruction expenditure, as we see that high math scores are related to low instruction expenditure.
#Question 6. How many regions are represented in the data?
# Keep only the region and state columns
RegionScores <- scores[, c("REGION", "STATE")]
# Summarise both columns; the REGION part lists the regions present in the data
summary(object = RegionScores)
## REGION STATE
## MIDWEST :300 DISTRICT_OF_COLUMBIA: 28
## NORTHEAST:225 VIRGINIA : 27
## SOUTHEAST:380 ALABAMA : 25
## SOUTHWEST:100 ALASKA : 25
## WEST :275 ARIZONA : 25
## ARKANSAS : 25
## (Other) :1125
#5 regions
#Question 7. Which region has the most states? The fewest?
# Number of distinct states in each region, computed directly so the answer
# does not rely on counting the non-zero cells of the per-region tables by hand.
# (Counting the non-zero entries of the tables printed below gives
# MIDWEST 12, NORTHEAST 9, SOUTHEAST 15, SOUTHWEST 4, WEST 11.)
states_per_region <- tapply(RegionScores$STATE, RegionScores$REGION,
                            function(state) length(unique(state)))
states_per_region
# Per-region subsets used for the detailed state tables
Southeast <- subset(RegionScores, REGION == "SOUTHEAST")
Southwest <- subset(RegionScores, REGION == "SOUTHWEST")
Midwest <- subset(RegionScores, REGION == "MIDWEST")
Northeast <- subset(RegionScores, REGION == "NORTHEAST")
West <- subset(RegionScores, REGION == "WEST")
# Row count per state within the Southeast; a state outside the region shows 0
summary(Southeast$STATE)
## ALABAMA ALASKA ARIZONA
## 25 0 0
## ARKANSAS CALIFORNIA COLORADO
## 25 0 0
## CONNECTICUT DELAWARE DISTRICT_OF_COLUMBIA
## 0 25 28
## FLORIDA GEORGIA HAWAII
## 25 25 0
## IDAHO ILLINOIS INDIANA
## 0 0 0
## IOWA KANSAS KENTUCKY
## 0 0 25
## LOUISIANA MAINE MARYLAND
## 25 0 25
## MASSACHUSETTS MICHIGAN MINNESOTA
## 0 0 0
## MISSISSIPPI MISSOURI MONTANA
## 25 0 0
## NEBRASKA NEVADA NEW_HAMPSHIRE
## 0 0 0
## NEW_JERSEY NEW_MEXICO NEW_YORK
## 0 0 0
## NORTH_CAROLINA NORTH_DAKOTA OHIO
## 25 0 0
## OKLAHOMA OREGON PENNSYLVANIA
## 0 0 0
## RHODE_ISLAND SOUTH_CAROLINA SOUTH_DAKOTA
## 0 25 0
## TENNESSEE TEXAS UTAH
## 25 0 0
## VERMONT VIRGINIA WASHINGTON
## 0 27 0
## WEST_VIRGINIA WISCONSIN WYOMING
## 25 0 0
# Row count per state within the Southwest; a state outside the region shows 0
summary(Southwest[["STATE"]])
## ALABAMA ALASKA ARIZONA
## 0 0 25
## ARKANSAS CALIFORNIA COLORADO
## 0 0 0
## CONNECTICUT DELAWARE DISTRICT_OF_COLUMBIA
## 0 0 0
## FLORIDA GEORGIA HAWAII
## 0 0 0
## IDAHO ILLINOIS INDIANA
## 0 0 0
## IOWA KANSAS KENTUCKY
## 0 0 0
## LOUISIANA MAINE MARYLAND
## 0 0 0
## MASSACHUSETTS MICHIGAN MINNESOTA
## 0 0 0
## MISSISSIPPI MISSOURI MONTANA
## 0 0 0
## NEBRASKA NEVADA NEW_HAMPSHIRE
## 0 0 0
## NEW_JERSEY NEW_MEXICO NEW_YORK
## 0 25 0
## NORTH_CAROLINA NORTH_DAKOTA OHIO
## 0 0 0
## OKLAHOMA OREGON PENNSYLVANIA
## 25 0 0
## RHODE_ISLAND SOUTH_CAROLINA SOUTH_DAKOTA
## 0 0 0
## TENNESSEE TEXAS UTAH
## 0 25 0
## VERMONT VIRGINIA WASHINGTON
## 0 0 0
## WEST_VIRGINIA WISCONSIN WYOMING
## 0 0 0
# Row count per state within the Midwest; a state outside the region shows 0
summary(Midwest[["STATE"]])
## ALABAMA ALASKA ARIZONA
## 0 0 0
## ARKANSAS CALIFORNIA COLORADO
## 0 0 0
## CONNECTICUT DELAWARE DISTRICT_OF_COLUMBIA
## 0 0 0
## FLORIDA GEORGIA HAWAII
## 0 0 0
## IDAHO ILLINOIS INDIANA
## 0 25 25
## IOWA KANSAS KENTUCKY
## 25 25 0
## LOUISIANA MAINE MARYLAND
## 0 0 0
## MASSACHUSETTS MICHIGAN MINNESOTA
## 0 25 25
## MISSISSIPPI MISSOURI MONTANA
## 0 25 0
## NEBRASKA NEVADA NEW_HAMPSHIRE
## 25 0 0
## NEW_JERSEY NEW_MEXICO NEW_YORK
## 0 0 0
## NORTH_CAROLINA NORTH_DAKOTA OHIO
## 0 25 25
## OKLAHOMA OREGON PENNSYLVANIA
## 0 0 0
## RHODE_ISLAND SOUTH_CAROLINA SOUTH_DAKOTA
## 0 0 25
## TENNESSEE TEXAS UTAH
## 0 0 0
## VERMONT VIRGINIA WASHINGTON
## 0 0 0
## WEST_VIRGINIA WISCONSIN WYOMING
## 0 25 0
# Row count per state within the Northeast; a state outside the region shows 0
summary(Northeast[["STATE"]])
## ALABAMA ALASKA ARIZONA
## 0 0 0
## ARKANSAS CALIFORNIA COLORADO
## 0 0 0
## CONNECTICUT DELAWARE DISTRICT_OF_COLUMBIA
## 25 0 0
## FLORIDA GEORGIA HAWAII
## 0 0 0
## IDAHO ILLINOIS INDIANA
## 0 0 0
## IOWA KANSAS KENTUCKY
## 0 0 0
## LOUISIANA MAINE MARYLAND
## 0 25 0
## MASSACHUSETTS MICHIGAN MINNESOTA
## 25 0 0
## MISSISSIPPI MISSOURI MONTANA
## 0 0 0
## NEBRASKA NEVADA NEW_HAMPSHIRE
## 0 0 25
## NEW_JERSEY NEW_MEXICO NEW_YORK
## 25 0 25
## NORTH_CAROLINA NORTH_DAKOTA OHIO
## 0 0 0
## OKLAHOMA OREGON PENNSYLVANIA
## 0 0 25
## RHODE_ISLAND SOUTH_CAROLINA SOUTH_DAKOTA
## 25 0 0
## TENNESSEE TEXAS UTAH
## 0 0 0
## VERMONT VIRGINIA WASHINGTON
## 25 0 0
## WEST_VIRGINIA WISCONSIN WYOMING
## 0 0 0
# Row count per state within the West; a state outside the region shows 0
summary(West[["STATE"]])
## ALABAMA ALASKA ARIZONA
## 0 25 0
## ARKANSAS CALIFORNIA COLORADO
## 0 25 25
## CONNECTICUT DELAWARE DISTRICT_OF_COLUMBIA
## 0 0 0
## FLORIDA GEORGIA HAWAII
## 0 0 25
## IDAHO ILLINOIS INDIANA
## 25 0 0
## IOWA KANSAS KENTUCKY
## 0 0 0
## LOUISIANA MAINE MARYLAND
## 0 0 0
## MASSACHUSETTS MICHIGAN MINNESOTA
## 0 0 0
## MISSISSIPPI MISSOURI MONTANA
## 0 0 25
## NEBRASKA NEVADA NEW_HAMPSHIRE
## 0 25 0
## NEW_JERSEY NEW_MEXICO NEW_YORK
## 0 0 0
## NORTH_CAROLINA NORTH_DAKOTA OHIO
## 0 0 0
## OKLAHOMA OREGON PENNSYLVANIA
## 0 25 0
## RHODE_ISLAND SOUTH_CAROLINA SOUTH_DAKOTA
## 0 0 0
## TENNESSEE TEXAS UTAH
## 0 0 25
## VERMONT VIRGINIA WASHINGTON
## 0 0 25
## WEST_VIRGINIA WISCONSIN WYOMING
## 0 0 25
#Most: Southeast(15), Least:Southwest(4)
#Question 8.Create a boxplot of AVG_MATH_8_SCORE vs. REGION. Add points to show the within-region means and a horizontal line to show the overall mean. You may need to review how to handle missing data.
# Keep only the grouping variable (REGION) and the response (8th grade math score)
scores.stacked <- scores[, c("REGION", "AVG_MATH_8_SCORE")]
names(scores.stacked) <- c("REGION", "AVG_MATH_8_SCORE")
# Preview the first rows; missing scores appear as NA
head(x = scores.stacked)
## REGION AVG_MATH_8_SCORE
## 1 SOUTHEAST 252.1875
## 2 WEST NA
## 3 SOUTHWEST 265.3663
## 4 SOUTHEAST 256.3121
## 5 WEST 260.8922
## 6 WEST 272.3984
# Boxplots of the 8th grade math score, one box per region
boxplot(AVG_MATH_8_SCORE ~ REGION,
        data = scores.stacked,
        main = "AVG_MATH_8_SCORE vs. REGION",
        xlab = "REGION",
        ylab = "AVG_MATH_8_SCORE",
        ylim = c(230, 320), # was misspelled "ylin", so the limit was never applied
        col = "red",
        cex.axis = .8)
# Within-region means; the formula interface drops rows with a missing score
means <- aggregate(AVG_MATH_8_SCORE ~ REGION, scores.stacked, mean)
points(means, col = "blue", pch = 18)
# Overall mean, ignoring missing scores
abline(h = mean(scores.stacked$AVG_MATH_8_SCORE, na.rm = TRUE),
       col = "blue",
       lwd = 2,
       lty = 2)
# Legend colours match what is drawn: blue diamonds and a blue dashed line
legend("bottomleft",
       legend = c("Within-Region mean", "Overall mean"),
       pch = c(18, NA),
       lty = c(NA, 2),
       col = c("blue", "blue"))

#Question 9. Interpret the boxplot. What does the boxplot suggest regarding the research questions?
#We see that the average math scores vary: the two southern regions have mostly scores below the overall mean. The midwest and northeast have scores above the overall mean and the west is slightly above the overall mean. In conclusion we see the highest scores in the midwest and northeast, and the lowest in the southeast and southwest.
#Question 10. Perform an analysis of variance that answers the first research question. Use the PCCC method.
#Prepare
#H0: μM=μN=μW=μSE=μSW
#Ha: not H0
#Check
# One data frame per region with missing scores removed; these are reused
# by the Q-Q plots further down
scores.M <- na.omit(subset(scores.stacked, REGION == "MIDWEST"))
scores.N <- na.omit(subset(scores.stacked, REGION == "NORTHEAST"))
scores.SE <- na.omit(subset(scores.stacked, REGION == "SOUTHEAST"))
scores.SW <- na.omit(subset(scores.stacked, REGION == "SOUTHWEST"))
scores.W <- na.omit(subset(scores.stacked, REGION == "WEST"))
# 2x3 grid holds the five regional panels; the outer top margin is for the title
par(mfrow = c(2, 3), oma = c(0, 0, 2, 0))
#Histograms
hist(scores.M$AVG_MATH_8_SCORE, main = "", xlab = "MIDWEST", col = "light blue")
# Label uses the region's actual name (was "NORTH")
hist(scores.N$AVG_MATH_8_SCORE, main = "", xlab = "NORTHEAST", col = "light blue")
hist(scores.SE$AVG_MATH_8_SCORE, main = "", xlab = "SOUTHEAST", col = "light blue")
hist(scores.SW$AVG_MATH_8_SCORE, main = "", xlab = "SOUTHWEST", col = "light blue")
hist(scores.W$AVG_MATH_8_SCORE, main = "", xlab = "WEST", col = "light blue")
# Shared title drawn in the outer margin
title("Histograms of AVG_MATH_8_SCORE by REGION", outer = TRUE, cex = 1.5)
# Start a fresh 2x3 grid for the Q-Q plots; the outer top margin is for the title
par(mfrow = c(2, 3), oma = c(0, 0, 2, 0))

#q-q plots
# Normal Q-Q plot plus reference line for each region
qqnorm(scores.M$AVG_MATH_8_SCORE, main = "", ylab = "Midwest")
qqline(scores.M$AVG_MATH_8_SCORE, col = "red")
qqnorm(scores.N$AVG_MATH_8_SCORE, main = "", ylab = "Northeast")
qqline(scores.N$AVG_MATH_8_SCORE, col = "red")
qqnorm(scores.SE$AVG_MATH_8_SCORE, main = "", ylab = "Southeast")
qqline(scores.SE$AVG_MATH_8_SCORE, col = "red")
qqnorm(scores.SW$AVG_MATH_8_SCORE, main = "", ylab = "Southwest")
qqline(scores.SW$AVG_MATH_8_SCORE, col = "red")
qqnorm(scores.W$AVG_MATH_8_SCORE, main = "", ylab = "West")
qqline(scores.W$AVG_MATH_8_SCORE, col = "red")
# Title names the data actually plotted (it previously said "ACT scores by language")
title("Normal QQ Plots of AVG_MATH_8_SCORE by REGION", outer = TRUE, cex = 1.5)
#We assume its independent
#The data is normal
#Variance is slight everywhere except in the southeast.
#Calculate
# Fit the one-way ANOVA of 8th grade math score on region
model.act <- aov(formula = AVG_MATH_8_SCORE ~ REGION, data = scores.stacked)
# Print the ANOVA table (F statistic and p-value)
summary(model.act)
## Df Sum Sq Mean Sq F value Pr(>F)
## REGION 4 16152 4038 54.22 <2e-16 ***
## Residuals 476 35451 74
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 799 observations deleted due to missingness
#Conclude
#p<2e-16 --> p<alpha=0.05; this means that there is at least one pair of means that are different.
# Question 11. Make pairwise comparisons, and answer the second research question. Use an overall significance level of αoverall=0.05.
# Pairwise pooled-SD t-tests between regions with Bonferroni-adjusted p-values
pairwise.t.test(scores.stacked$AVG_MATH_8_SCORE,
                scores.stacked$REGION,
                p.adjust.method = "bonferroni")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: scores.stacked$AVG_MATH_8_SCORE and scores.stacked$REGION
##
## MIDWEST NORTHEAST SOUTHEAST SOUTHWEST
## NORTHEAST 1.0000 - - -
## SOUTHEAST < 2e-16 < 2e-16 - -
## SOUTHWEST 5.1e-09 8.1e-09 0.4591 -
## WEST 0.0025 0.0027 1.0e-13 0.0040
##
## P value adjustment method: bonferroni
#if p<.05 reject otherwise...
#fail to reject--> no statistic significance that there is a difference in means.
#Midwest
#H0: μM−μN=0 fail to reject
#H0: μM−μSE=0 reject
#H0: μM−μSW=0 reject
#H0: μM−μW=0 reject
#Northeast
#H0: μN−μSE=0 reject
#H0: μN−μSW=0 reject
#H0: μN−μW=0 reject
#Southeast
#H0: μSE−μSW=0 fail to reject
#H0: μSE−μW=0 reject
#Southwest
#H0: μSW−μW=0 reject
#12. Propose a possible explanation for your findings.
#From the boxplot created above we can see that the mean lines of the midwest and northeast are close to each other; the southern regions have the same relationship, but below the overall mean. This is the reason those two pairs failed to reject the null. Possible reasons for such results can be the quality of math teachers in that region, students' motivation, students' focus in school etc.
#Documentation: C2C Sermipong helped me on questions 7, 10 and 11. I also looked at the posts on teams regarding the lab, some people gave advice on how to do the questions.
