## Warning: package 'gridExtra' was built under R version 3.2.5

Part 1 - Introduction:

Research Question:

Null hypothesis: There is no statistically significant correlation between scores on the PISA Educational Assessment and the percentage of GDP per capital spent on education.

Alternative Hypothesis: There is statistical evidence that the GDP spent per capital has influence on PISA scores.

Part 2 - Data:

What are the cases and how many are there/ Data Source:

Describe the method of data collection:

What type of study is this (observational/experiment)?

What is the Response Variable?

What is the Explanatory Variable?

Scope of Inference

Initial Data Manipulation

After initially looking at all Pisa assessment results for which we have a corresponding GDP spending figure we can now look more closely at the OECD member countries to see if there is a large difference.

Part 3 - Exploratory data analysis:

All Countries

x <- GDP %>% arrange(desc(gdp))
head(x)
##   Country.Code  year      gdp
## 1          LTU X2012 70.64836
## 2          SRB X2012 50.95265
## 3          MDA X2012 39.31168
## 4          CYP X2012 32.57038
## 5          UKR X2012 32.38915
## 6          TLS X2012 31.89363
rm(x)
#summary of GDP measures
summary(scienceA$gdp) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   9.221  17.120  20.560  21.780  23.810  70.650
#summary of the various scores
summary(scienceA$score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   373.1   445.7   495.7   482.2   515.5   554.9
summary(mathA$score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   368.1   448.9   489.8   476.0   505.5   561.2
summary(readingA$score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   384.2   443.9   488.5   476.7   508.4   544.6
a1<- ggplot(scienceA, aes(score)) + geom_histogram(binwidth=10, color='dodgerblue1') + ggtitle('Science All Countries')
a2<- ggplot(mathA, aes(score)) + geom_histogram(binwidth=10, color='dodgerblue1') + ggtitle('Math All Countries')
a3<- ggplot(readingA, aes(score)) + geom_histogram(binwidth=10, color='dodgerblue1') + ggtitle('Reading All Countries')
a4 <- ggplot(scienceA, aes(gdp)) + geom_histogram(binwidth=2, color='green') + ggtitle('GDP All Countries')
grid.arrange(a1, a2, a3, a4)

OECD Member Countries Only

#summary of GDP measures
summary(science$gdp) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   14.60   18.84   20.71   21.32   23.82   29.95
#summary of the various scores
summary(science$score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   414.9   491.8   501.9   501.5   520.0   546.7
summary(math$score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   413.3   484.6   497.0   494.2   511.5   536.4
summary(reading$score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   423.6   487.8   496.9   495.6   510.7   538.1
p1<- ggplot(science, aes(score)) + geom_histogram(binwidth=5, color='dodgerblue1') + ggtitle('science')
p2<- ggplot(math, aes(score)) + geom_histogram(binwidth=5, color='dodgerblue1') + ggtitle('math')
p3<- ggplot(reading, aes(score)) + geom_histogram(binwidth=5, color='dodgerblue1') + ggtitle('reading')
p4 <- ggplot(science, aes(gdp)) + geom_histogram(binwidth=1, color='green') + ggtitle('gdp')
grid.arrange(p1, p2, p3, p4)

Part 4 - Inference:

*The residuals are nearly normal

*I don’t believe that the data meets the condition for roughly constant variability around the least squares line.

# This plot is for a visual only because each test score has three scores associated with it.  To do linear regression we need to break out each score.
ggplot(combined) + aes(x = gdp, y = score, color = Indicator.Code) + geom_point()

All Countries in Sample

g1 <- ggplot(mathA) + aes(x = gdp, y = score) + geom_point(color = 'dodgerblue2') + ggtitle('math') + geom_smooth(method = "lm", se = FALSE)


g2 <- ggplot(scienceA) + aes(x = gdp, y = score) + geom_point(color = 'red') + ggtitle('science')+ geom_smooth(method = "lm", se = FALSE)


g3 <- ggplot(readingA) + aes(x = gdp, y = score) + geom_point(color = 'green') + ggtitle('reading') + geom_smooth(method = "lm", se = FALSE)
grid.arrange(g1, g2, g3)

a1 <- lm(score ~ gdp, data = mathA)
summary(a1)
## 
## Call:
## lm(formula = score ~ gdp, data = mathA)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -96.31 -42.39  13.16  31.36  92.23 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 455.8906    17.6851  25.778   <2e-16 ***
## gdp           0.9246     0.7407   1.248    0.219    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48.55 on 43 degrees of freedom
## Multiple R-squared:  0.03497,    Adjusted R-squared:  0.01253 
## F-statistic: 1.558 on 1 and 43 DF,  p-value: 0.2187
a2 <- lm(score ~ gdp, data = scienceA)
summary(a2)
## 
## Call:
## lm(formula = score ~ gdp, data = scienceA)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -98.51 -33.52  14.49  31.41  79.11 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 463.8244    16.5292  28.061   <2e-16 ***
## gdp           0.8459     0.6918   1.223    0.228    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.59 on 45 degrees of freedom
## Multiple R-squared:  0.03215,    Adjusted R-squared:  0.01065 
## F-statistic: 1.495 on 1 and 45 DF,  p-value: 0.2278
a3 <- lm(score ~ gdp, data = readingA)
summary(a3)
## 
## Call:
## lm(formula = score ~ gdp, data = readingA)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -83.76 -34.31  13.34  31.09  73.22 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 461.4919    15.0602  30.643   <2e-16 ***
## gdp           0.6965     0.6303   1.105    0.275    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42.45 on 45 degrees of freedom
## Multiple R-squared:  0.02642,    Adjusted R-squared:  0.004783 
## F-statistic: 1.221 on 1 and 45 DF,  p-value: 0.275

Model Diagnostics

r1 <- ggplot(mathA) + aes(x = gdp, y = a1$residuals) + geom_point(color = 'dodgerblue2') + ggtitle('math') + geom_smooth(method = "lm", se = FALSE)


r2 <- ggplot(scienceA) + aes(x = gdp, y = a2$residuals) + geom_point(color = 'red') + ggtitle('science')+ geom_smooth(method = "lm", se = FALSE)


r3 <- ggplot(readingA) + aes(x = gdp, y = a3$residuals) + geom_point(color = 'green') + ggtitle('reading') + geom_smooth(method = "lm", se = FALSE)
grid.arrange(r1, r2, r3)


s1<- ggplot(scienceA, aes(a2$residuals)) + geom_histogram(binwidth=15, color='dodgerblue1') + ggtitle('science')
s2<- ggplot(mathA, aes(a1$residuals)) + geom_histogram(binwidth=15, color='dodgerblue1') + ggtitle('math')
s3<- ggplot(readingA, aes(a3$residuals)) + geom_histogram(binwidth=15, color='dodgerblue1') + ggtitle('reading')

grid.arrange(s1, s2, s3)

qqnorm(a1$residuals) 
qqline(a1$residuals)

qqnorm(a2$residuals) 
qqline(a2$residuals)

qqnorm(a3$residuals) 
qqline(a3$residuals)

OECD Member Countries

x1 <- ggplot(math) + aes(x = gdp, y = score) + geom_point(color = 'dodgerblue2') + ggtitle('math') + geom_smooth(method = "lm", se = FALSE)


x2 <- ggplot(science) + aes(x = gdp, y = score) + geom_point(color = 'red') + ggtitle('science')+ geom_smooth(method = "lm", se = FALSE)


x3 <- ggplot(reading) + aes(x = gdp, y = score) + geom_point(color = 'green') + ggtitle('reading') + geom_smooth(method = "lm", se = FALSE)
grid.arrange(x1, x2, x3)

m1 <- lm(score ~ gdp, data = math)
summary(m1)
## 
## Call:
## lm(formula = score ~ gdp, data = math)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -62.312 -10.750  -0.206  17.678  40.213 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  435.241     28.091  15.494 2.89e-15 ***
## gdp            2.764      1.300   2.126   0.0425 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 25.16 on 28 degrees of freedom
## Multiple R-squared:  0.139,  Adjusted R-squared:  0.1082 
## F-statistic: 4.519 on 1 and 28 DF,  p-value: 0.04248
m2 <- lm(score ~ gdp, data = science)
summary(m2)
## 
## Call:
## lm(formula = score ~ gdp, data = science)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -70.902 -10.098  -0.001  18.229  45.530 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  451.700     30.255  14.930 7.34e-15 ***
## gdp            2.337      1.400   1.669    0.106    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27.1 on 28 degrees of freedom
## Multiple R-squared:  0.09048,    Adjusted R-squared:  0.058 
## F-statistic: 2.785 on 1 and 28 DF,  p-value: 0.1063
m3 <- lm(score ~ gdp, data = reading)
summary(m3)
## 
## Call:
## lm(formula = score ~ gdp, data = reading)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -60.65 -10.86   1.46  14.77  38.26 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  459.441     25.699  17.878   <2e-16 ***
## gdp            1.696      1.189   1.426    0.165    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.02 on 28 degrees of freedom
## Multiple R-squared:  0.06769,    Adjusted R-squared:  0.03439 
## F-statistic: 2.033 on 1 and 28 DF,  p-value: 0.165

Model Diagnostics

t1 <- ggplot(math) + aes(x = gdp, y = m1$residuals) + geom_point(color = 'dodgerblue2') + ggtitle('math') + geom_smooth(method = "lm", se = FALSE)


t2 <- ggplot(science) + aes(x = gdp, y = m2$residuals) + geom_point(color = 'red') + ggtitle('science')+ geom_smooth(method = "lm", se = FALSE)


t3 <- ggplot(reading) + aes(x = gdp, y = m3$residuals) + geom_point(color = 'green') + ggtitle('reading') + geom_smooth(method = "lm", se = FALSE)
grid.arrange(t1, t2, t3)



u1<- ggplot(math, aes(m1$residuals)) + geom_histogram(binwidth=15, color='dodgerblue1') + ggtitle('math')

u2<- ggplot(science, aes(m2$residuals)) + geom_histogram(binwidth=15, color='dodgerblue1') + ggtitle('science')

u3<- ggplot(reading, aes(m3$residuals)) + geom_histogram(binwidth=15, color='dodgerblue1') + ggtitle('reading')

grid.arrange(u1, u2, u3)

qqnorm(m1$residuals) 
qqline(m1$residuals)

qqnorm(m2$residuals) 
qqline(m2$residuals)

qqnorm(m3$residuals) 
qqline(m3$residuals)

Part 5 - Conclusion:

References:

The World Bank GDP The World Bank PISA OECD OpenIntro