#Zhivko Kolevski T7
#ANOVA LAB

#Motivating Example

# Read the sample ACT scores: one column per language, one row per student
scores <- read.csv(file = "ACT Sample Scores.csv", header = TRUE)

# Column types and the first few values of each column
str(object = scores)
## 'data.frame':    8 obs. of  4 variables:
##  $ English: int  28 26 24 29 30 24 25 26
##  $ French : int  33 29 31 31 30 29 28 32
##  $ German : int  33 31 29 30 32 29 28 34
##  $ Spanish: int  31 29 26 30 24 29 34 33

# Five-number summary and mean for each language
summary(object = scores)
##     English          French          German         Spanish     
##  Min.   :24.00   Min.   :28.00   Min.   :28.00   Min.   :24.00  
##  1st Qu.:24.75   1st Qu.:29.00   1st Qu.:29.00   1st Qu.:28.25  
##  Median :26.00   Median :30.50   Median :30.50   Median :29.50  
##  Mean   :26.50   Mean   :30.38   Mean   :30.75   Mean   :29.50  
##  3rd Qu.:28.25   3rd Qu.:31.25   3rd Qu.:32.25   3rd Qu.:31.50  
##  Max.   :30.00   Max.   :33.00   Max.   :34.00   Max.   :34.00

# Print the full (small) data set
scores
##   English French German Spanish
## 1      28     33     33      31
## 2      26     29     31      29
## 3      24     31     29      26
## 4      29     31     30      30
## 5      30     30     32      24
## 6      24     29     29      29
## 7      25     28     28      34
## 8      26     32     34      33
# Reshape from wide (one column per language) to long (one value column
# plus one column naming the language the value came from)
scores.stacked <- stack(scores)

# Inspect the structure of the stacked data
str(scores.stacked)
## 'data.frame':    32 obs. of  2 variables:
##  $ values: int  28 26 24 29 30 24 25 26 33 29 ...
##  $ ind   : Factor w/ 4 levels "English","French",..: 1 1 1 1 1 1 1 1 2 2 ...

# Summarise the stacked data
summary(scores.stacked)
##      values           ind   
##  Min.   :24.00   English:8  
##  1st Qu.:28.00   French :8  
##  Median :29.00   German :8  
##  Mean   :29.28   Spanish:8  
##  3rd Qu.:31.00              
##  Max.   :34.00

# Show the first 10 rows
head(scores.stacked, n = 10)
##    values     ind
## 1      28 English
## 2      26 English
## 3      24 English
## 4      29 English
## 5      30 English
## 6      24 English
## 7      25 English
## 8      26 English
## 9      33  French
## 10     29  French

# Change the column names to something descriptive
names(scores.stacked) <- c("act_score", "language")

# Show the first 10 rows again, now with the new names
head(scores.stacked, n = 10)
##    act_score language
## 1         28  English
## 2         26  English
## 3         24  English
## 4         29  English
## 5         30  English
## 6         24  English
## 7         25  English
## 8         26  English
## 9         33   French
## 10        29   French
# Visualize the data

# Mean ACT score per language, overlaid on the boxplot below
means <- aggregate(act_score ~ language, data = scores.stacked, FUN = mean)

# Baseline boxplot; the upper y limit is raised to make room for the legend
boxplot(
  act_score ~ language,
  data = scores.stacked,
  main = "Test Scores by Language",
  xlab = "Language",
  ylab = "ACT Score",
  ylim = c(24, 36),
  col = "light blue"
)

# Mark each language's mean on its box
points(means, pch = 18, col = "black")

# Dashed, thicker red reference line at the overall mean
abline(h = mean(scores.stacked$act_score), lty = 2, lwd = 2, col = "red")

# Legend: point symbol for the group means, dashed line for the overall mean
legend(
  "topleft",
  legend = c("Within-language mean", "Overall mean"),
  col = c("black", "red"),
  lty = c(NA, 2),
  pch = c(18, NA)
)

#PHASE 1
#prepare
#H0: μE=μF=μG=μS 
#Ha: not H0

#check
#1.Are the observations independent within and across groups? 
#Based on the description of the data, we do not have reason to believe that information about one observation would give us additional information about an observation within the same group or in a different group. We may assume independence.

#2. Are the data within each group nearly normal? To check this assumption, we should view histograms and Normal Q-Q plots. This assumption only needs to be loosely held. So long as the data appear unimodal and mound shaped, we won’t have any serious reasons for concern.

# Create one subset per language to make plotting easier.
# These objects are reused by the Q-Q plots further down.
scores.e <- subset(scores.stacked, language == "English")
scores.f <- subset(scores.stacked, language == "French")
scores.g <- subset(scores.stacked, language == "German")
scores.s <- subset(scores.stacked, language == "Spanish")

# Set a 2x2 plot window with an increased upper outer margin for the title.
# par() returns the previous values of the settings it changes; keep them so
# the layout can be restored once the figure is finished.
old_par <- par(mfrow = c(2, 2), oma = c(0, 0, 2, 0))

# Plot one histogram per language
hist(scores.e$act_score,
     main = "",
     xlab = "English scores",
     col = "light blue")
hist(scores.f$act_score,
     main = "",
     xlab = "French scores",
     col = "light blue")
hist(scores.g$act_score,
     main = "",
     xlab = "German scores",
     col = "light blue")
hist(scores.s$act_score,
     main = "",
     xlab = "Spanish scores",
     col = "light blue")

# Add an overall title in the outer margin
title("Histograms of ACT scores by language", outer = TRUE, cex = 1.5)

# Restore the previous layout so later plots are not drawn into this 2x2 grid
par(old_par)

# Set a 2x2 plot window with an increased upper outer margin for the title.
# par() returns the previous values of the settings it changes; keep them so
# the layout can be restored once the figure is finished.
old_par <- par(mfrow = c(2, 2), oma = c(0, 0, 2, 0))

# Normal Q-Q plot per language, each with a red reference line

# English
qqnorm(scores.e$act_score,
       main = "",
       ylab = "English scores")
qqline(scores.e$act_score, col = "red")

# French
qqnorm(scores.f$act_score,
       main = "",
       ylab = "French scores")
qqline(scores.f$act_score, col = "red")

# German
qqnorm(scores.g$act_score,
       main = "",
       ylab = "German scores")
qqline(scores.g$act_score, col = "red")

# Spanish
qqnorm(scores.s$act_score,
       main = "",
       ylab = "Spanish scores")
qqline(scores.s$act_score, col = "red")

# Add an overall title in the outer margin
title("Normal QQ Plots of ACT scores by language", outer = TRUE, cex = 1.5)

# Restore the previous layout so the scatterplots and boxplot drawn later
# in the script are not squeezed into this 2x2 grid
par(old_par)

#3. Is the variability across groups about equal? There are formal methods to compare variances, but we will not use them in this course. Boxplots are helpful to informally check the equal variance assumption. In particular, if the IQRs are similar in width, then we can consider this assumption met. In our example, the IQRs are similar in width, so we are safe to proceed under the assumption of equal variances.

#Calculate: As with all other hypothesis tests, the goal of the Calculate step is to compute the p-value that corresponds to our stated hypotheses. Fortunately for us, R computes the F test statistic, standard error, and p-value for us when we use the aov() command.

# Fit the one-way ANOVA of ACT score on language
model.act <- aov(act_score ~ language, data = scores.stacked)

# Print the ANOVA table (degrees of freedom, sums of squares, F value, p-value)
summary(model.act)
##             Df Sum Sq Mean Sq F value  Pr(>F)   
## language     3  89.09  29.698   5.028 0.00651 **
## Residuals   28 165.37   5.906                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Conclude
#Since p-value=0.00651<0.05=α, we have statistically significant evidence that at least one pair of mean ACT scores is different. That is, we have sufficient evidence to reject H0: μE=μF=μG=μS.

#PHASE 2
# Pairwise t-tests between languages with no p-value adjustment
pairwise.t.test(
  scores.stacked$act_score,
  scores.stacked$language,
  p.adjust.method = "none"
)
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  scores.stacked$act_score and scores.stacked$language 
## 
##         English French German
## French  0.0035  -      -     
## German  0.0016  0.7599 -     
## Spanish 0.0199  0.4774 0.3124
## 
## P value adjustment method: none
#This could have increased Type 1 error


# Same pairwise t-tests, with Bonferroni-adjusted p-values
pairwise.t.test(
  scores.stacked$act_score,
  scores.stacked$language,
  p.adjust.method = "bonferroni"
)
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  scores.stacked$act_score and scores.stacked$language 
## 
##         English French German
## French  0.0210  -      -     
## German  0.0095  1.0000 -     
## Spanish 0.1195  1.0000 1.0000
## 
## P value adjustment method: bonferroni
#Comparing these adjusted p-values to αoverall=0.05 will lead to the same conclusions as comparing the non-adjusted p-values to αsingle≈0.0083 (0.05 divided by the 6 comparisons). We conclude from the data that there are statistically significant differences between English and French average ACT scores and English and German average ACT scores.

# ON YOUR OWN
# Load the state education data (this replaces the ACT `scores` object).
# stringsAsFactors = TRUE keeps the text columns (PRIMARY_KEY, STATE, REGION)
# as factors on R >= 4.0.0, where read.csv() no longer converts them by
# default. The str()/summary() output recorded in this script and the later
# per-state count tables from summary(<subset>$STATE) rely on factor columns.
scores <- read.csv("states_edu.csv", stringsAsFactors = TRUE)

# Column types and the first few values of each column
str(scores)
## 'data.frame':    1280 obs. of  26 variables:
##  $ PRIMARY_KEY                 : Factor w/ 1275 levels "1992_ALABAMA",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ STATE                       : Factor w/ 51 levels "ALABAMA","ALASKA",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ YEAR                        : int  1992 1992 1992 1992 1992 1992 1992 1992 1992 1992 ...
##  $ REGION                      : Factor w/ 5 levels "MIDWEST","NORTHEAST",..: 3 5 4 3 5 5 2 3 3 3 ...
##  $ ENROLL                      : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ TOTAL_REVENUE               : int  2678885 1049591 3258079 1711959 26260025 3185173 3834302 645233 709480 11506299 ...
##  $ FEDERAL_REVENUE             : int  304177 106780 297888 178571 2072470 163253 143542 45945 64749 788420 ...
##  $ STATE_REVENUE               : int  1659028 720711 1369815 958785 16546514 1307986 1342539 420942 0 5683949 ...
##  $ LOCAL_REVENUE               : int  715680 222100 1590376 574603 7641041 1713934 2348221 178346 644731 5033930 ...
##  $ TOTAL_EXPENDITURE           : int  2653798 972488 3401580 1743022 27138832 3264826 3721338 638784 742893 11305642 ...
##  $ INSTRUCTION_EXPENDITURE     : int  1481703 498362 1435908 964323 14358922 1642466 2148041 372722 329160 5166374 ...
##  $ SUPPORT_SERVICES_EXPENDITURE: int  735036 350902 1007732 483488 8520926 1035970 1142600 194915 316679 3410440 ...
##  $ OTHER_EXPENDITURE           : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ CAPITAL_OUTLAY_EXPENDITURE  : int  174053 37451 609114 145212 2044688 364760 48542 30595 47272 1667826 ...
##  $ GRADES_PK_G                 : int  8224 2371 2544 808 59067 7410 5731 463 4818 31464 ...
##  $ GRADES_KG_G                 : int  55460 10152 53497 33511 431763 47588 41319 8025 6667 161701 ...
##  $ GRADES_4_G                  : int  57948 9748 55433 34632 418418 50648 38058 8272 5832 164416 ...
##  $ GRADES_8_G                  : int  58025 8789 49081 36011 363296 45025 33691 8012 5000 142372 ...
##  $ GRADES_12_G                 : int  41167 6714 37410 27651 270675 34533 28366 6129 3433 100835 ...
##  $ GRADES_1_8_G                : int  471564 79117 437127 281338 3286034 394904 304284 67495 47009 1276685 ...
##  $ GRADES_9_12_G               : int  196386 30847 175210 123113 1372011 160299 126917 28338 18173 511557 ...
##  $ GRADES_ALL_G                : int  676174 112335 614881 405259 4717112 562613 436932 96296 70000 1819706 ...
##  $ AVG_MATH_4_SCORE            : num  208 NA 215 210 208 ...
##  $ AVG_MATH_8_SCORE            : num  252 NA 265 256 261 ...
##  $ AVG_READING_4_SCORE         : num  208 NA 206 209 197 ...
##  $ AVG_READING_8_SCORE         : num  NA 259 262 265 NA ...
# Per-column summary of the state education data; the recorded output below
# also reports the number of NA's in each column that has missing values
summary(scores)
##                     PRIMARY_KEY                    STATE           YEAR     
##  2008_VIRGINIA            :   3   DISTRICT_OF_COLUMBIA:  28   Min.   :1992  
##  2008_DISTRICT_OF_COLUMBIA:   2   VIRGINIA            :  27   1st Qu.:1998  
##  2009_DISTRICT_OF_COLUMBIA:   2   ALABAMA             :  25   Median :2004  
##  2010_DISTRICT_OF_COLUMBIA:   2   ALASKA              :  25   Mean   :2004  
##  1992_ALABAMA             :   1   ARIZONA             :  25   3rd Qu.:2010  
##  1992_ALASKA              :   1   ARKANSAS            :  25   Max.   :2016  
##  (Other)                  :1269   (Other)             :1125                 
##        REGION        ENROLL        TOTAL_REVENUE      FEDERAL_REVENUE  
##  MIDWEST  :300   Min.   :  43866   Min.   :  465650   Min.   :  31020  
##  NORTHEAST:225   1st Qu.: 258314   1st Qu.: 2186305   1st Qu.: 189354  
##  SOUTHEAST:380   Median : 648313   Median : 5079546   Median : 403376  
##  SOUTHWEST:100   Mean   : 915931   Mean   : 9092082   Mean   : 766372  
##  WEST     :275   3rd Qu.:1014528   3rd Qu.:10859848   3rd Qu.: 828966  
##                  Max.   :6307022   Max.   :89217262   Max.   :9990221  
##                  NA's   :51                                            
##  STATE_REVENUE      LOCAL_REVENUE      TOTAL_EXPENDITURE 
##  Min.   :       0   Min.   :   22093   Min.   :  481665  
##  1st Qu.: 1153097   1st Qu.:  715834   1st Qu.: 2165404  
##  Median : 2537074   Median : 2055780   Median : 5234506  
##  Mean   : 4216553   Mean   : 4109157   Mean   : 9196681  
##  3rd Qu.: 5080939   3rd Qu.: 4768680   3rd Qu.:10745191  
##  Max.   :50904567   Max.   :36105265   Max.   :85320133  
##                                                          
##  INSTRUCTION_EXPENDITURE SUPPORT_SERVICES_EXPENDITURE OTHER_EXPENDITURE
##  Min.   :  265549        Min.   :  139963             Min.   :  11541  
##  1st Qu.: 1168032        1st Qu.:  635790             1st Qu.: 102831  
##  Median : 2657452        Median : 1525406             Median : 271596  
##  Mean   : 4762966        Mean   : 2680331             Mean   : 429205  
##  3rd Qu.: 5568028        3rd Qu.: 3229651             3rd Qu.: 518600  
##  Max.   :43964520        Max.   :26058021             Max.   :3995951  
##                                                       NA's   :51       
##  CAPITAL_OUTLAY_EXPENDITURE  GRADES_PK_G      GRADES_KG_G       GRADES_4_G    
##  Min.   :   12708           Min.   :     0   Min.   :  3459   Min.   :  2548  
##  1st Qu.:  181564           1st Qu.:  2600   1st Qu.: 19603   1st Qu.: 19305  
##  Median :  510260           Median :  9384   Median : 48841   Median : 49288  
##  Mean   :  902769           Mean   : 18969   Mean   : 69623   Mean   : 70423  
##  3rd Qu.:  966852           3rd Qu.: 23981   3rd Qu.: 79396   3rd Qu.: 79078  
##  Max.   :10223657           Max.   :250911   Max.   :530531   Max.   :493415  
##                             NA's   :62       NA's   :51       NA's   :51      
##    GRADES_8_G      GRADES_12_G      GRADES_1_8_G     GRADES_9_12_G    
##  Min.   :  1485   Min.   :   484   Min.   :  19226   Min.   :   2758  
##  1st Qu.: 20091   1st Qu.: 17020   1st Qu.: 156033   1st Qu.:  76118  
##  Median : 49332   Median : 39634   Median : 399494   Median : 181719  
##  Mean   : 70162   Mean   : 59375   Mean   : 566614   Mean   : 270109  
##  3rd Qu.: 80022   3rd Qu.: 69858   3rd Qu.: 638447   3rd Qu.: 310607  
##  Max.   :500143   Max.   :498403   Max.   :3929869   Max.   :2013687  
##  NA's   :51       NA's   :51       NA's   :51        NA's   :51       
##   GRADES_ALL_G     AVG_MATH_4_SCORE AVG_MATH_8_SCORE AVG_READING_4_SCORE
##  Min.   :  24554   Min.   :187.1    Min.   :232.8    Min.   :178.6      
##  1st Qu.: 253825   1st Qu.:228.7    1st Qu.:271.9    1st Qu.:214.4      
##  Median : 608846   Median :236.8    Median :280.2    Median :220.2      
##  Mean   : 857515   Mean   :234.3    Mean   :278.0    Mean   :218.6      
##  3rd Qu.: 967278   3rd Qu.:242.0    3rd Qu.:285.1    3rd Qu.:223.9      
##  Max.   :5944746   Max.   :253.4    Max.   :300.6    Max.   :236.8      
##  NA's   :62        NA's   :795      NA's   :799      NA's   :798        
##  AVG_READING_8_SCORE
##  Min.   :236.4      
##  1st Qu.:259.2      
##  Median :264.8      
##  Mean   :263.4      
##  3rd Qu.:268.1      
##  Max.   :277.2      
##  NA's   :833
#scores (output is too big for the turn in)

#1 Do 8th grade math scores vary across geographic regions of the United States? 
#2 If so, which regions?

#Question 1. How many observations does the dataset contain? How many variables?
#Observations 1280
#Variables 26

#Question 2. What does each observation represent?
# It is data about a state in a certain year(1992-2016)

#Question 3. Create a scatterplot of AVG_MATH_8_SCORE vs. AVG_READING_8_SCORE and interpret the plot.
plot(scores$AVG_READING_8_SCORE, scores$AVG_MATH_8_SCORE,
     xlab = "AVG_READING_8_SCORE", ylab = "AVG_MATH_8_SCORE")
#Higher math scores are proportional to higher reading scores in most cases

#Question 4.Create a scatterplot of AVG_MATH_8_SCORE vs. AVG_MATH_4_SCORE and interpret the plot.
# The y variable must be the 8th grade MATH score so that it matches the
# question and the axis label (the reading score was plotted here by mistake).
plot(scores$AVG_MATH_4_SCORE, scores$AVG_MATH_8_SCORE,
     xlab = "AVG_MATH_4_SCORE", ylab = "AVG_MATH_8_SCORE")
#Similar to previous question, the higher math score in 4th is proportion to 8th in most cases.

#Question 5. Create a scatterplot of AVG_MATH_8_SCORE vs. INSTRUCTION_EXPENDITURE and interpret the plot.
plot(scores$INSTRUCTION_EXPENDITURE, scores$AVG_MATH_8_SCORE,
     xlab = "INSTRUCTION_EXPENDITURE", ylab = "AVG_MATH_8_SCORE")
#In this case the 8th grade math score is not proportional to instruction expenditure, as we see that many high math scores occur at low instruction expenditure.

#Question 6. How many regions are represented in the data?
# Keep only the region and state columns, then count rows per level
RegionScores <- scores[, c("REGION", "STATE")]
summary(RegionScores)
##        REGION                     STATE     
##  MIDWEST  :300   DISTRICT_OF_COLUMBIA:  28  
##  NORTHEAST:225   VIRGINIA            :  27  
##  SOUTHEAST:380   ALABAMA             :  25  
##  SOUTHWEST:100   ALASKA              :  25  
##  WEST     :275   ARIZONA             :  25  
##                  ARKANSAS            :  25  
##                  (Other)             :1125
#5 regions

#Question 7. Which region has the most states? The fewest?

# One subset of RegionScores per region
Southeast <- subset(RegionScores, REGION == "SOUTHEAST")
Southwest <- subset(RegionScores, REGION == "SOUTHWEST")
Midwest <- subset(RegionScores, REGION == "MIDWEST")
Northeast <- subset(RegionScores, REGION == "NORTHEAST")
West <- subset(RegionScores, REGION == "WEST")

# Row counts per state in the Southeast subset (states outside it show 0)
summary(Southeast[["STATE"]])
##              ALABAMA               ALASKA              ARIZONA 
##                   25                    0                    0 
##             ARKANSAS           CALIFORNIA             COLORADO 
##                   25                    0                    0 
##          CONNECTICUT             DELAWARE DISTRICT_OF_COLUMBIA 
##                    0                   25                   28 
##              FLORIDA              GEORGIA               HAWAII 
##                   25                   25                    0 
##                IDAHO             ILLINOIS              INDIANA 
##                    0                    0                    0 
##                 IOWA               KANSAS             KENTUCKY 
##                    0                    0                   25 
##            LOUISIANA                MAINE             MARYLAND 
##                   25                    0                   25 
##        MASSACHUSETTS             MICHIGAN            MINNESOTA 
##                    0                    0                    0 
##          MISSISSIPPI             MISSOURI              MONTANA 
##                   25                    0                    0 
##             NEBRASKA               NEVADA        NEW_HAMPSHIRE 
##                    0                    0                    0 
##           NEW_JERSEY           NEW_MEXICO             NEW_YORK 
##                    0                    0                    0 
##       NORTH_CAROLINA         NORTH_DAKOTA                 OHIO 
##                   25                    0                    0 
##             OKLAHOMA               OREGON         PENNSYLVANIA 
##                    0                    0                    0 
##         RHODE_ISLAND       SOUTH_CAROLINA         SOUTH_DAKOTA 
##                    0                   25                    0 
##            TENNESSEE                TEXAS                 UTAH 
##                   25                    0                    0 
##              VERMONT             VIRGINIA           WASHINGTON 
##                    0                   27                    0 
##        WEST_VIRGINIA            WISCONSIN              WYOMING 
##                   25                    0                    0
# Row counts per state in the Southwest subset (states outside it show 0)
summary(Southwest[["STATE"]])
##              ALABAMA               ALASKA              ARIZONA 
##                    0                    0                   25 
##             ARKANSAS           CALIFORNIA             COLORADO 
##                    0                    0                    0 
##          CONNECTICUT             DELAWARE DISTRICT_OF_COLUMBIA 
##                    0                    0                    0 
##              FLORIDA              GEORGIA               HAWAII 
##                    0                    0                    0 
##                IDAHO             ILLINOIS              INDIANA 
##                    0                    0                    0 
##                 IOWA               KANSAS             KENTUCKY 
##                    0                    0                    0 
##            LOUISIANA                MAINE             MARYLAND 
##                    0                    0                    0 
##        MASSACHUSETTS             MICHIGAN            MINNESOTA 
##                    0                    0                    0 
##          MISSISSIPPI             MISSOURI              MONTANA 
##                    0                    0                    0 
##             NEBRASKA               NEVADA        NEW_HAMPSHIRE 
##                    0                    0                    0 
##           NEW_JERSEY           NEW_MEXICO             NEW_YORK 
##                    0                   25                    0 
##       NORTH_CAROLINA         NORTH_DAKOTA                 OHIO 
##                    0                    0                    0 
##             OKLAHOMA               OREGON         PENNSYLVANIA 
##                   25                    0                    0 
##         RHODE_ISLAND       SOUTH_CAROLINA         SOUTH_DAKOTA 
##                    0                    0                    0 
##            TENNESSEE                TEXAS                 UTAH 
##                    0                   25                    0 
##              VERMONT             VIRGINIA           WASHINGTON 
##                    0                    0                    0 
##        WEST_VIRGINIA            WISCONSIN              WYOMING 
##                    0                    0                    0
# Row counts per state in the Midwest subset (states outside it show 0)
summary(Midwest[["STATE"]])
##              ALABAMA               ALASKA              ARIZONA 
##                    0                    0                    0 
##             ARKANSAS           CALIFORNIA             COLORADO 
##                    0                    0                    0 
##          CONNECTICUT             DELAWARE DISTRICT_OF_COLUMBIA 
##                    0                    0                    0 
##              FLORIDA              GEORGIA               HAWAII 
##                    0                    0                    0 
##                IDAHO             ILLINOIS              INDIANA 
##                    0                   25                   25 
##                 IOWA               KANSAS             KENTUCKY 
##                   25                   25                    0 
##            LOUISIANA                MAINE             MARYLAND 
##                    0                    0                    0 
##        MASSACHUSETTS             MICHIGAN            MINNESOTA 
##                    0                   25                   25 
##          MISSISSIPPI             MISSOURI              MONTANA 
##                    0                   25                    0 
##             NEBRASKA               NEVADA        NEW_HAMPSHIRE 
##                   25                    0                    0 
##           NEW_JERSEY           NEW_MEXICO             NEW_YORK 
##                    0                    0                    0 
##       NORTH_CAROLINA         NORTH_DAKOTA                 OHIO 
##                    0                   25                   25 
##             OKLAHOMA               OREGON         PENNSYLVANIA 
##                    0                    0                    0 
##         RHODE_ISLAND       SOUTH_CAROLINA         SOUTH_DAKOTA 
##                    0                    0                   25 
##            TENNESSEE                TEXAS                 UTAH 
##                    0                    0                    0 
##              VERMONT             VIRGINIA           WASHINGTON 
##                    0                    0                    0 
##        WEST_VIRGINIA            WISCONSIN              WYOMING 
##                    0                   25                    0
# Row counts per state in the Northeast subset (states outside it show 0)
summary(Northeast[["STATE"]])
##              ALABAMA               ALASKA              ARIZONA 
##                    0                    0                    0 
##             ARKANSAS           CALIFORNIA             COLORADO 
##                    0                    0                    0 
##          CONNECTICUT             DELAWARE DISTRICT_OF_COLUMBIA 
##                   25                    0                    0 
##              FLORIDA              GEORGIA               HAWAII 
##                    0                    0                    0 
##                IDAHO             ILLINOIS              INDIANA 
##                    0                    0                    0 
##                 IOWA               KANSAS             KENTUCKY 
##                    0                    0                    0 
##            LOUISIANA                MAINE             MARYLAND 
##                    0                   25                    0 
##        MASSACHUSETTS             MICHIGAN            MINNESOTA 
##                   25                    0                    0 
##          MISSISSIPPI             MISSOURI              MONTANA 
##                    0                    0                    0 
##             NEBRASKA               NEVADA        NEW_HAMPSHIRE 
##                    0                    0                   25 
##           NEW_JERSEY           NEW_MEXICO             NEW_YORK 
##                   25                    0                   25 
##       NORTH_CAROLINA         NORTH_DAKOTA                 OHIO 
##                    0                    0                    0 
##             OKLAHOMA               OREGON         PENNSYLVANIA 
##                    0                    0                   25 
##         RHODE_ISLAND       SOUTH_CAROLINA         SOUTH_DAKOTA 
##                   25                    0                    0 
##            TENNESSEE                TEXAS                 UTAH 
##                    0                    0                    0 
##              VERMONT             VIRGINIA           WASHINGTON 
##                   25                    0                    0 
##        WEST_VIRGINIA            WISCONSIN              WYOMING 
##                    0                    0                    0
# Row counts per state in the West subset (states outside it show 0)
summary(West[["STATE"]])
##              ALABAMA               ALASKA              ARIZONA 
##                    0                   25                    0 
##             ARKANSAS           CALIFORNIA             COLORADO 
##                    0                   25                   25 
##          CONNECTICUT             DELAWARE DISTRICT_OF_COLUMBIA 
##                    0                    0                    0 
##              FLORIDA              GEORGIA               HAWAII 
##                    0                    0                   25 
##                IDAHO             ILLINOIS              INDIANA 
##                   25                    0                    0 
##                 IOWA               KANSAS             KENTUCKY 
##                    0                    0                    0 
##            LOUISIANA                MAINE             MARYLAND 
##                    0                    0                    0 
##        MASSACHUSETTS             MICHIGAN            MINNESOTA 
##                    0                    0                    0 
##          MISSISSIPPI             MISSOURI              MONTANA 
##                    0                    0                   25 
##             NEBRASKA               NEVADA        NEW_HAMPSHIRE 
##                    0                   25                    0 
##           NEW_JERSEY           NEW_MEXICO             NEW_YORK 
##                    0                    0                    0 
##       NORTH_CAROLINA         NORTH_DAKOTA                 OHIO 
##                    0                    0                    0 
##             OKLAHOMA               OREGON         PENNSYLVANIA 
##                    0                   25                    0 
##         RHODE_ISLAND       SOUTH_CAROLINA         SOUTH_DAKOTA 
##                    0                    0                    0 
##            TENNESSEE                TEXAS                 UTAH 
##                    0                    0                   25 
##              VERMONT             VIRGINIA           WASHINGTON 
##                    0                    0                   25 
##        WEST_VIRGINIA            WISCONSIN              WYOMING 
##                    0                    0                   25
#Most: Southeast(15), Least:Southwest(4)

#Question 8.Create a boxplot of AVG_MATH_8_SCORE vs. REGION. Add points to show the within-region means and a horizontal line to show the overall mean. You may need to review how to handle missing data.

# Keep only the grouping variable and the response. The selected columns
# already carry these names, so no renaming is needed.
scores.stacked <- subset(scores, select = c("REGION", "AVG_MATH_8_SCORE"))
head(scores.stacked)
##      REGION AVG_MATH_8_SCORE
## 1 SOUTHEAST         252.1875
## 2      WEST               NA
## 3 SOUTHWEST         265.3663
## 4 SOUTHEAST         256.3121
## 5      WEST         260.8922
## 6      WEST         272.3984

# Boxplot of 8th grade math score by region. `ylim` (previously misspelled
# `ylin`, which is not a graphical parameter) fixes the y-axis range.
boxplot(AVG_MATH_8_SCORE ~ REGION,
        data = scores.stacked,
        main = "AVG_MATH_8_SCORE vs. REGION",
        xlab = "REGION",
        ylab = "AVG_MATH_8_SCORE",
        ylim = c(230, 320),
        col = "red",
        cex.axis = .8)

# Within-region means; the formula interface drops rows with a missing score
means <- aggregate(AVG_MATH_8_SCORE ~ REGION, data = scores.stacked, FUN = mean)

points(means, col = "blue", pch = 18)

# Overall mean, ignoring missing scores
abline(h = mean(scores.stacked$AVG_MATH_8_SCORE, na.rm = TRUE),
       col = "blue", lwd = 2, lty = 2)

# Legend colours match what is drawn: both the mean points and the
# overall-mean line are blue
legend("bottomleft",
       legend = c("Within-Region mean", "Overall mean"),
       pch = c(18, NA),
       lty = c(NA, 2),
       col = c("blue", "blue"))

#Question 9. Interpret the boxplot. What does the boxplot suggest regarding the research questions?
#We see that the average math scores vary by region: the two southern regions have scores mostly below the overall mean, the Midwest and Northeast have scores above the overall mean, and the West is slightly above the overall mean. In conclusion, we see the highest scores in the Midwest and Northeast, and the lowest in the Southeast and Southwest.

#Question 10. Perform an analysis of variance that answers the first research question. Use the PCCC method.

#Prepare
#H0: μM=μN=μW=μSE=μSW
#Ha: not H0

#Check
# One subset per region, with rows that have a missing score dropped
scores.M <- na.omit(subset(scores.stacked, REGION == "MIDWEST"))
scores.N <- na.omit(subset(scores.stacked, REGION == "NORTHEAST"))
scores.SE <- na.omit(subset(scores.stacked, REGION == "SOUTHEAST"))
scores.SW <- na.omit(subset(scores.stacked, REGION == "SOUTHWEST"))
scores.W <- na.omit(subset(scores.stacked, REGION == "WEST"))

# 2x3 plot window with an increased upper outer margin for the title.
# par() returns the previous values of the settings it changes; keep them so
# the layout can be restored after both figures are drawn.
old_par <- par(mfrow = c(2, 3), oma = c(0, 0, 2, 0))

#Histograms
hist(scores.M$AVG_MATH_8_SCORE, main = "", xlab = "MIDWEST", col = "light blue")
hist(scores.N$AVG_MATH_8_SCORE, main = "", xlab = "NORTHEAST", col = "light blue")
hist(scores.SE$AVG_MATH_8_SCORE, main = "", xlab = "SOUTHEAST", col = "light blue")
hist(scores.SW$AVG_MATH_8_SCORE, main = "", xlab = "SOUTHWEST", col = "light blue")
hist(scores.W$AVG_MATH_8_SCORE, main = "", xlab = "WEST", col = "light blue")

title("Histograms of AVG_MATH_8_SCORE by REGION", outer = TRUE, cex = 1.5)

# Set the same 2x3 layout again before drawing the Q-Q plots
par(mfrow = c(2, 3), oma = c(0, 0, 2, 0))

#q-q plots
qqnorm(scores.M$AVG_MATH_8_SCORE, main = "", ylab = "Midwest")
qqline(scores.M$AVG_MATH_8_SCORE, col = "red")

qqnorm(scores.N$AVG_MATH_8_SCORE, main = "", ylab = "Northeast")
qqline(scores.N$AVG_MATH_8_SCORE, col = "red")

qqnorm(scores.SE$AVG_MATH_8_SCORE, main = "", ylab = "Southeast")
qqline(scores.SE$AVG_MATH_8_SCORE, col = "red")

qqnorm(scores.SW$AVG_MATH_8_SCORE, main = "", ylab = "Southwest")
qqline(scores.SW$AVG_MATH_8_SCORE, col = "red")

qqnorm(scores.W$AVG_MATH_8_SCORE, main = "", ylab = "West")
qqline(scores.W$AVG_MATH_8_SCORE, col = "red")

# Title describes the region data (it previously repeated the ACT title)
title("Normal QQ Plots of AVG_MATH_8_SCORE by REGION", outer = TRUE, cex = 1.5)

# Restore the previous layout so later plots are not drawn into this grid
par(old_par)

#We assume its independent
#The data is normal
#Variance is slight everywhere except in the southeast.

#Calculate
# Fit the one-way ANOVA of 8th grade math score on region
# (this reuses, and so overwrites, the model.act name from Phase 1)
model.act <- aov(AVG_MATH_8_SCORE ~ REGION, data = scores.stacked)

# Print the ANOVA table
summary(model.act)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## REGION        4  16152    4038   54.22 <2e-16 ***
## Residuals   476  35451      74                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 799 observations deleted due to missingness
#Conclude
#p < 2e-16 --> p < alpha = 0.05, so we reject H0: there is at least one pair of regional means that are different.

# Question 11. Make pairwise comparisons, and answer the second research question. Use an overall significance level of αoverall=0.05.
# Pairwise t-tests between regions with Bonferroni-adjusted p-values
pairwise.t.test(
  scores.stacked$AVG_MATH_8_SCORE,
  scores.stacked$REGION,
  p.adjust.method = "bonferroni"
)
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  scores.stacked$AVG_MATH_8_SCORE and scores.stacked$REGION 
## 
##           MIDWEST NORTHEAST SOUTHEAST SOUTHWEST
## NORTHEAST 1.0000  -         -         -        
## SOUTHEAST < 2e-16 < 2e-16   -         -        
## SOUTHWEST 5.1e-09 8.1e-09   0.4591    -        
## WEST      0.0025  0.0027    1.0e-13   0.0040   
## 
## P value adjustment method: bonferroni
#If the adjusted p-value is < .05, reject H0 for that pair; otherwise
#fail to reject --> no statistically significant evidence that there is a difference in means.

#Midwest
#H0:  μM−μN=0   fail to reject
#H0:  μM−μSE=0  reject
#H0:  μM−μSW=0  reject
#H0:  μM−μW=0   reject
#Northeast
#H0:  μN−μSE=0  reject
#H0:  μN−μSW=0  reject
#H0:  μN−μW=0   reject
#Southeast
#H0:  μSE−μSW=0  fail to reject
#H0:  μSE−μW=0   reject
#Southwest
#H0:  μSW−μW=0   reject

#12. Propose a possible explanation for your findings.
#From the boxplot created above we can see that the means of the Midwest and Northeast are close to each other, and the two southern regions show the same relationship but below the overall mean. This is the reason we failed to reject the null for those two pairs. Possible reasons for the regional differences can be the quality of math teachers in that region, students' motivation, students' focus in school, etc.

#Documentation: C2C Sermipong helped me on questions 7, 10 and 11. I also looked at the posts on teams regarding the lab, some people gave advice on how to do the questions.