I have renamed the variables in the original data set as follows:
* SES: Socio-economic status, coded with numeric values of 1 to 3 representing Low, Middle, and High
* reading: Reading test
* writing: Writing assessment
* maths: Math test
* science: General science test
* civics: Civics knowledge test
* locus: Locus of control
* concept: Self-concept
* motiv: Academic motivation
In this step, I am checking the descriptives of the variables as well as their distribution and normality.
str(my_data)
## 'data.frame': 90 obs. of 9 variables:
## $ SES : Factor w/ 3 levels "1","2","3": 1 1 1 2 2 2 1 1 2 1 ...
## $ reading: num 33.6 46.9 41.6 38.9 36.3 49.5 62.7 44.2 46.9 44.2 ...
## $ writing: num 43.7 35.9 59.3 41.1 48.9 46.3 64.5 51.5 41.1 49.5 ...
## $ math : num 40.2 41.9 41.9 32.7 39.5 46.2 48 36.9 45.3 40.5 ...
## $ science: num 39 36.3 44.4 41.7 41.7 41.7 63.4 49.8 47.1 39 ...
## $ civics : num 40.6 45.6 45.6 40.6 45.6 35.6 55.6 55.6 55.6 50.6 ...
## $ locus : num 0.29 -0.42 0.71 0.06 0.22 0.46 0.44 0.68 0.06 0.05 ...
## $ concept: num 0.88 0.03 0.03 0.03 -0.28 0.03 -0.47 0.25 0.56 0.15 ...
## $ motiv : num 0.67 0.33 0.67 0 0 0 0.33 1 0.33 1 ...
my_data %>% group_by(SES) %>%
summarise(n = n(),
reading_m = mean(reading), reading_sd = sd(reading),
writing_m = mean(writing), writing_sd = sd(writing),
maths_m = mean(math), maths_sd = sd(math),
science_m = mean(science), science_sd = sd(science),
civics_m = mean(civics), civics_sd = sd(civics),
locus_m = mean(locus), locus_sd = sd(locus),
concept_m = mean(concept), concept_sd = sd(concept),
motiv_m = mean(motiv), motiv_sd = sd(motiv)) %>%
kable(
col.names = c("SES", "*n*", "*M*", "*SD*", "*M*", "*SD*", "*M*", "*SD*",
"*M*", "*SD*", "*M*", "*SD*", "*M*", "*SD*", "*M*", "*SD*",
"*M*", "*SD*"),
caption = "Means and Standard Deviations of Academic Performance by SES Categories",
digits = 2
) %>%
kable_styling()
## `summarise()` ungrouping output (override with `.groups` argument)
| SES | n | Reading M | Reading SD | Writing M | Writing SD | Math M | Math SD | Science M | Science SD | Civics M | Civics SD | Locus M | Locus SD | Concept M | Concept SD | Motivation M | Motivation SD |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 30 | 44.18 | 9.24 | 45.59 | 11.25 | 45.67 | 8.35 | 42.75 | 8.75 | 45.35 | 8.61 | -0.15 | 0.70 | -0.21 | 0.71 | 0.56 | 0.39 |
| 2 | 30 | 44.43 | 7.70 | 45.69 | 7.45 | 44.49 | 8.15 | 43.30 | 8.48 | 46.47 | 7.27 | -0.23 | 0.78 | 0.05 | 0.76 | 0.53 | 0.38 |
| 3 | 30 | 56.04 | 9.86 | 55.26 | 9.71 | 56.50 | 10.69 | 53.68 | 10.88 | 54.77 | 10.50 | 0.17 | 0.50 | 0.18 | 0.68 | 0.71 | 0.35 |
Descriptives show that there are equal numbers of students in each SES group (n = 30 each), with the means of all variables being highest in the high SES group. Additionally, mean locus of control is negative in both the low and middle SES groups, and mean self-concept is negative in the low SES group. Overall, means look roughly similar for the low and middle SES groups, with the clearest differences appearing for the high SES students. Let's explore this further!
shapiro.test(my_data$reading)
##
## Shapiro-Wilk normality test
##
## data: my_data$reading
## W = 0.96172, p-value = 0.009525
shapiro.test(my_data$writing)
##
## Shapiro-Wilk normality test
##
## data: my_data$writing
## W = 0.96771, p-value = 0.02445
shapiro.test(my_data$math)
##
## Shapiro-Wilk normality test
##
## data: my_data$math
## W = 0.93676, p-value = 0.0002783
shapiro.test(my_data$science)
##
## Shapiro-Wilk normality test
##
## data: my_data$science
## W = 0.95458, p-value = 0.003243
shapiro.test(my_data$civics)
##
## Shapiro-Wilk normality test
##
## data: my_data$civics
## W = 0.96203, p-value = 0.00999
shapiro.test(my_data$locus)
##
## Shapiro-Wilk normality test
##
## data: my_data$locus
## W = 0.97126, p-value = 0.04336
shapiro.test(my_data$motiv)
##
## Shapiro-Wilk normality test
##
## data: my_data$motiv
## W = 0.83226, p-value = 1.051e-08
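Note that self-concept was not included in the Shapiro-Wilk checks above; the same call would cover it (output not reproduced here):
shapiro.test(my_data$concept)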
g <- ggplot(my_data, aes(x=reading, stat(density)))
g + geom_histogram(binwidth=1, color="black", fill="white") +
geom_density() +
ggtitle("Normality of Reading") +
xlab("Reading") + ylab("Density")
g <- ggplot(my_data, aes(x=writing, stat(density)))
g + geom_histogram(binwidth=1, color="black", fill="white") +
geom_density() +
ggtitle("Normality of Writing") +
xlab("Writing") + ylab("Density")
g <- ggplot(my_data, aes(x=math, stat(density)))
g + geom_histogram(binwidth=1, color="black", fill="white") +
geom_density() +
ggtitle("Normality of Math") +
xlab("Math") + ylab("Density")
g <- ggplot(my_data, aes(x=science, stat(density)))
g + geom_histogram(binwidth=1, color="black", fill="white") +
geom_density() +
ggtitle("Normality of Science") +
xlab("Science") + ylab("Density")
g <- ggplot(my_data, aes(x=civics, stat(density)))
g + geom_histogram(binwidth=1, color="black", fill="white") +
geom_density() +
ggtitle("Normality of Civics") +
xlab("Civics") + ylab("Density")
# These three scales range roughly from -2 to 1, so a narrower binwidth than 1 is used
g <- ggplot(my_data, aes(x=locus, stat(density)))
g + geom_histogram(binwidth=0.25, color="black", fill="white") +
geom_density() +
ggtitle("Normality of Locus of Control") +
xlab("Locus of control") + ylab("Density")
g <- ggplot(my_data, aes(x=concept, stat(density)))
g + geom_histogram(binwidth=0.25, color="black", fill="white") +
geom_density() +
ggtitle("Normality of Self-Concept") +
xlab("Self-concept") + ylab("Density")
g <- ggplot(my_data, aes(x=motiv, stat(density)))
g + geom_histogram(binwidth=0.25, color="black", fill="white") +
geom_density() +
ggtitle("Normality of Academic Motivation") +
xlab("Academic motivation") + ylab("Density")
#qqplots
ggqqplot(my_data$reading)
ggqqplot(my_data$writing)
ggqqplot(my_data$math)
ggqqplot(my_data$science)
ggqqplot(my_data$civics)
ggqqplot(my_data$locus)
ggqqplot(my_data$concept)
ggqqplot(my_data$motiv)
ggboxplot(my_data,
x = "SES",
y = c("reading", "writing", "math", "science", "civics", "locus",
"concept", "motiv"),
xlab = "SES",
ylab = "Average Score",
merge = TRUE)
The results from the Shapiro-Wilk normality tests show that all of the tested variables depart from normality, as all p-values are less than .05. This is also visible in the histograms of the variables above. However, given the patterns in the QQ plots, the data do not appear to be far from normal, as most points fall close to the reference line. The most concerning variables, judging from the QQ plots, are self-concept and academic motivation.
The boxplots of each variable by SES category show many outliers in the low SES group, and four outliers each in the middle and high SES groups. We can also see that the means of the five academic variables are higher for the high SES group, and that these variables have a wide range within the high SES group.
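Because the boxplots flag several outliers, a multivariate outlier check is also worth running before the MANOVA. The lines below are my own addition (a base-R sketch, not part of the original assignment code): they compute Mahalanobis distances across the eight dependent variables and flag cases beyond a conventional chi-square cutoff.
# Flag potential multivariate outliers via Mahalanobis distance
dvs <- my_data[, c("reading", "writing", "math", "science",
"civics", "locus", "concept", "motiv")]
md <- mahalanobis(dvs, center = colMeans(dvs), cov = cov(dvs))
cutoff <- qchisq(0.999, df = ncol(dvs)) # .001 criterion, df = number of DVs
which(md > cutoff) # row numbers of flagged cases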
To test for significant differences among the test variables as a function of SES, a MANOVA should be conducted. I am running Wilks' lambda, Pillai's trace, the Hotelling-Lawley trace, and Roy's largest root because SAS reports values for each, but I will report Pillai's trace due to its robustness (a quick check of the related covariance-homogeneity assumption is sketched below).
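As that assumption check, Box's M test of equal covariance matrices can be run first; the sketch below is my own addition and assumes the heplots package is installed.
# Box's M test of homogeneity of covariance matrices (requires heplots)
library(heplots)
boxM(my_data[, c("reading", "writing", "math", "science",
"civics", "locus", "concept", "motiv")], my_data$SES)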
res.man <- manova(cbind(reading,writing,math,science,civics,locus,concept,motiv) ~ SES, data = my_data)
summary(res.man, test= "Wilks")
## Df Wilks approx F num Df den Df Pr(>F)
## SES 2 0.59476 2.9667 16 160 0.0002491 ***
## Residuals 87
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(res.man, test= "Pillai")
## Df Pillai approx F num Df den Df Pr(>F)
## SES 2 0.4215 2.7037 16 162 0.0007916 ***
## Residuals 87
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(res.man, test= "Roy")
## Df Roy approx F num Df den Df Pr(>F)
## SES 2 0.60911 6.1673 8 81 3.363e-06 ***
## Residuals 87
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(res.man, test= "Hotelling-Lawley")
## Df Hotelling-Lawley approx F num Df den Df Pr(>F)
## SES 2 0.65401 3.2292 16 158 7.81e-05 ***
## Residuals 87
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
As can be seen from the results of the Pillai's trace test, there is a statistically significant difference in test scores as a function of SES group, V = 0.42, F(16, 162) = 2.70, p < .001. Despite different F values, all other test statistics indicate significant differences as well. A rough multivariate effect size is sketched below, after which we conduct univariate follow-up tests to identify the dependent variables on which the SES groups differ.
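The lines below are my own addition, using the common V / s convention for a multivariate partial eta-squared, where s is the smaller of the number of DVs and the hypothesis df; treat this as a rough estimate rather than part of the original output.
# Approximate multivariate partial eta-squared from Pillai's trace
V <- 0.4215 # Pillai's trace from the summary above
s <- min(8, 2) # 8 dependent variables, 2 hypothesis df
V / s # roughly 0.21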
summary.aov(res.man)
## Response reading :
## Df Sum Sq Mean Sq F value Pr(>F)
## SES 2 2752.8 1376.4 17.078 5.542e-07 ***
## Residuals 87 7011.9 80.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response writing :
## Df Sum Sq Mean Sq F value Pr(>F)
## SES 2 1851.0 925.51 10.046 0.0001188 ***
## Residuals 87 8015.2 92.13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response math :
## Df Sum Sq Mean Sq F value Pr(>F)
## SES 2 2630.1 1315.05 15.752 1.452e-06 ***
## Residuals 87 7263.4 83.49
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response science :
## Df Sum Sq Mean Sq F value Pr(>F)
## SES 2 2272.3 1136.14 12.769 1.373e-05 ***
## Residuals 87 7740.9 88.98
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response civics :
## Df Sum Sq Mean Sq F value Pr(>F)
## SES 2 1590.0 794.99 10.058 0.0001176 ***
## Residuals 87 6876.7 79.04
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response locus :
## Df Sum Sq Mean Sq F value Pr(>F)
## SES 2 2.672 1.33603 2.9779 0.05611 .
## Residuals 87 39.032 0.44864
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Response concept :
## Df Sum Sq Mean Sq F value Pr(>F)
## SES 2 2.362 1.18105 2.2944 0.1069
## Residuals 87 44.784 0.51476
##
## Response motiv :
## Df Sum Sq Mean Sq F value Pr(>F)
## SES 2 0.5637 0.28185 2.0487 0.1351
## Residuals 87 11.9692 0.13758
Looking at the univariate tests, we see significant differences as a function of SES for all variables with the exception of locus of control, self-concept, and academic motivation. More specifically, the three groups show statistically significant differences in reading, F(2, 87) = 17.08, p < .001; writing, F(2, 87) = 10.05, p < .001; math, F(2, 87) = 15.75, p < .001; science, F(2, 87) = 12.77, p < .001; and civics, F(2, 87) = 10.06, p < .001. Despite differences emerging in these academic domains, there were no statistically significant differences for the three trait measures: locus of control, F(2, 87) = 2.98, p = .06; self-concept, F(2, 87) = 2.29, p = .11; and academic motivation, F(2, 87) = 2.05, p = .14.
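With eight univariate follow-ups, the p-values could also be adjusted for multiple testing; the sketch below is my own addition and simply applies Holm's method to the p-values reported above.
# Holm adjustment of the eight univariate follow-up p-values
p_vals <- c(reading = 5.542e-07, writing = 0.0001188, math = 1.452e-06,
science = 1.373e-05, civics = 0.0001176, locus = 0.05611,
concept = 0.1069, motiv = 0.1351)
round(p.adjust(p_vals, method = "holm"), 4)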
Next, a linear discriminant analysis (lda() from the MASS package) will be conducted as a DFA to examine the multivariate differences among groups; a jackknifed (leave-one-out) prediction check is sketched further below.
fit<-lda(SES ~ reading + writing + math + science + civics + locus + concept + motiv, data=my_data)
fit
## Call:
## lda(SES ~ reading + writing + math + science + civics + locus +
## concept + motiv, data = my_data)
##
## Prior probabilities of groups:
## 1 2 3
## 0.3333333 0.3333333 0.3333333
##
## Group means:
## reading writing math science civics locus concept motiv
## 1 44.18000 45.59333 45.67000 42.75333 45.34667 -0.1490000 -0.20633333 0.5553333
## 2 44.43333 45.68667 44.48667 43.30333 46.46667 -0.2273333 0.04633333 0.5336667
## 3 56.03667 55.26000 56.50000 53.67667 54.77000 0.1710000 0.18500000 0.7113333
##
## Coefficients of linear discriminants:
## LD1 LD2
## reading 0.0379762506 -0.0032566994
## writing -0.0023201993 -0.0005631982
## math 0.0540853731 0.0701305063
## science 0.0008793068 -0.0331192287
## civics 0.0347469431 -0.0519074476
## locus 0.3047745632 0.3661632633
## concept 0.6349853064 -1.1370509164
## motiv -0.0699456247 1.5227526138
##
## Proportion of trace:
## LD1 LD2
## 0.9314 0.0686
plot(fit)
#predictions
predictions <- fit %>% predict(my_data)
names(predictions)
## [1] "class" "posterior" "x"
#plot LDA
lda.data <- cbind(my_data, predict(fit)$x)
ggplot(lda.data, aes(LD1, LD2)) +
geom_point(aes(color = SES))
As we can see from the plot of LD1 versus LD2 colored by SES, there is considerable overlap between where low and middle SES individuals fall, with high SES students appearing to form their own cluster higher on LD1, toward the right of the plot. Looking at the proportion of trace, we see that the first discriminant function is strongly related to SES group membership, with about 93% of the between-group variability associated with LD1. Given this proportion of variance, our attention should be focused on the coefficients of LD1.
Writing and academic motivation both have negative LD1 weights, while all other variables have positive weights; however, most of the weights are close to 0. Using 0.3 as a cut-off, as suggested in lecture, we should focus only on locus of control and self-concept.
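To quantify the overlap described above, and to add the jackknifed prediction mentioned earlier, a classification check can be bolted on. This is my own addition: the first two lines reuse the predictions object created above (resubstitution accuracy), and the lda() call with CV = TRUE gives leave-one-out (jackknifed) estimates.
# Resubstitution confusion matrix and accuracy
table(Predicted = predictions$class, Actual = my_data$SES)
mean(predictions$class == my_data$SES)
# Jackknifed (leave-one-out) classification
fit_cv <- lda(SES ~ reading + writing + math + science + civics +
locus + concept + motiv, data = my_data, CV = TRUE)
table(Predicted = fit_cv$class, Actual = my_data$SES)
mean(fit_cv$class == my_data$SES)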
Quite honestly, I had trouble interpreting the DFA past this point. I was not able to figure out the R code for standardized coefficients or eigenvalues; one possible approach is sketched below, and Tukey follow-up tests are then used to identify which groups differ significantly on each DV.
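One possible route to those quantities (my own sketch, not the approach from lecture): the singular values stored in the lda fit can be squared to recover the eigenvalues that underlie the proportion of trace, and roughly standardized coefficients can be obtained by z-scoring the predictors before refitting; the Tukey follow-ups then appear below.
# Eigenvalues of the discriminant functions from the stored singular values
fit$svd^2 # eigenvalues (between- to within-group variance ratios)
fit$svd^2 / sum(fit$svd^2) # reproduces the proportion of trace
# Approximately standardized coefficients: z-score predictors, then refit
num_vars <- c("reading", "writing", "math", "science",
"civics", "locus", "concept", "motiv")
my_data_std <- my_data
my_data_std[num_vars] <- scale(my_data_std[num_vars])
fit_std <- lda(SES ~ reading + writing + math + science + civics +
locus + concept + motiv, data = my_data_std)
fit_std$scaling # coefficients now on a comparable scale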
reading_post<- aov(data = my_data, reading ~ SES)
reading_tukey <- TukeyHSD(x = reading_post, "SES", conf.level=0.95)
reading_tukey
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = reading ~ SES, data = my_data)
##
## $SES
## diff lwr upr p adj
## 2-1 0.2533333 -5.273881 5.780548 0.9934371
## 3-1 11.8566667 6.329452 17.383881 0.0000055
## 3-2 11.6033333 6.076119 17.130548 0.0000085
plot(reading_tukey)
writing_post <- aov(data = my_data, writing ~ SES)
writing_tukey <- TukeyHSD(x = writing_post, "SES", conf.level=0.95)
writing_tukey
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = writing ~ SES, data = my_data)
##
## $SES
## diff lwr upr p adj
## 2-1 0.09333333 -5.816090 6.002757 0.9992184
## 3-1 9.66666667 3.757243 15.576090 0.0005484
## 3-2 9.57333333 3.663910 15.482757 0.0006249
plot(writing_tukey)
math_post<- aov(data = my_data, math ~ SES)
math_tukey <- TukeyHSD(x = math_post, "SES", conf.level=0.95)
math_tukey
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = math ~ SES, data = my_data)
##
## $SES
## diff lwr upr p adj
## 2-1 -1.183333 -6.808788 4.442122 0.8707406
## 3-1 10.830000 5.204545 16.455455 0.0000437
## 3-2 12.013333 6.387878 17.638788 0.0000060
plot(math_tukey)
science_post <- aov(data = my_data, science ~ SES)
science_tukey <- TukeyHSD(x = science_post, "SES", conf.level=0.95)
science_tukey
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = science ~ SES, data = my_data)
##
## $SES
## diff lwr upr p adj
## 2-1 0.55000 -5.257423 6.357423 0.9722872
## 3-1 10.92333 5.115910 16.730756 0.0000654
## 3-2 10.37333 4.565910 16.180756 0.0001519
plot(science_tukey)
civics_post <- aov(data = my_data, civics ~ SES)
civics_tukey <- TukeyHSD(x = civics_post, "SES", conf.level=0.95)
civics_tukey
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = civics ~ SES, data = my_data)
##
## $SES
## diff lwr upr p adj
## 2-1 1.120000 -4.353653 6.593653 0.8772358
## 3-1 9.423333 3.949680 14.896986 0.0002660
## 3-2 8.303333 2.829680 13.776986 0.0014355
plot(civics_tukey)
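The five follow-up blocks above could also be generated more compactly; the loop below is my own addition and produces the same Tukey tests for the five academic DVs rather than a different analysis.
# Same Tukey HSD follow-ups for the five academic DVs in one loop
academic_dvs <- c("reading", "writing", "math", "science", "civics")
tukey_list <- lapply(academic_dvs, function(v) {
TukeyHSD(aov(reformulate("SES", response = v), data = my_data), "SES")
})
names(tukey_list) <- academic_dvs
tukey_list$reading # e.g., inspect the reading comparisons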
Looking at the results from the reading post-hoc test as well as the plot, we see significant differences between low and high SES students, as well as between middle and high SES students (ps < .001). The same pattern emerges for writing, math, science, and civics, with all p-values below .01. These results show that low and middle SES students do not differ in their academic performance, but both groups differ significantly from high SES students.
These data suggest statistically significant differences in academic performance as a function of a student's socio-economic status. However, individual difference variables such as self-concept, academic motivation, and locus of control do not differ by SES. It may be that the advantages that accompany higher SES, such as access to financial resources, tutoring, and better schools, are driving these results, rather than inherent differences in intelligence as a function of family social or economic status.