library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1       ✔ purrr   0.3.2  
## ✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.3       ✔ stringr 1.4.0  
## ✔ readr   1.3.1       ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(openintro)
## Please visit openintro.org for free statistics materials
## 
## Attaching package: 'openintro'
## The following object is masked from 'package:ggplot2':
## 
##     diamonds
## The following objects are masked from 'package:datasets':
## 
##     cars, trees
load(file="student.RData")

head(StudentsPerformance)
## # A tibble: 6 x 8
##   gender `race/ethnicity` `parental level… lunch `test preparati…
##   <chr>  <chr>            <chr>            <chr> <chr>           
## 1 female group B          bachelor's degr… stan… none            
## 2 female group C          some college     stan… completed       
## 3 female group B          master's degree  stan… none            
## 4 male   group A          associate's deg… free… none            
## 5 male   group C          some college     stan… none            
## 6 female group B          associate's deg… stan… none            
## # … with 3 more variables: `math score` <dbl>, `reading score` <dbl>,
## #   `writing score` <dbl>
data <- StudentsPerformance

data <- rename(data, race_eth = 'race/ethnicity')
data <- rename(data, parenteduc = 'parental level of education')
data <- rename(data, test_prep = 'test preparation course')
data <- rename(data, math_score = 'math score')
data <- rename(data, reading_score = 'reading score')
data <- rename(data, writing_score = 'writing score')


data$gender <- as.factor(data$gender)
data$race_eth <- as.factor(data$race_eth)
data$parenteduc <- as.factor(data$parenteduc)
data$test_prep <- as.factor(data$test_prep)
data$lunch <- as.factor(data$lunch)

data <- data %>%
  mutate(totalscore =math_score+reading_score+writing_score)

str(data)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 1000 obs. of  9 variables:
##  $ gender       : Factor w/ 2 levels "female","male": 1 1 1 2 2 1 1 2 2 1 ...
##  $ race_eth     : Factor w/ 5 levels "group A","group B",..: 2 3 2 1 3 2 2 2 4 2 ...
##  $ parenteduc   : Factor w/ 6 levels "associate's degree",..: 2 5 4 1 5 1 5 5 3 3 ...
##  $ lunch        : Factor w/ 2 levels "free/reduced",..: 2 2 2 1 2 2 2 1 1 1 ...
##  $ test_prep    : Factor w/ 2 levels "completed","none": 2 1 2 2 2 2 1 2 1 2 ...
##  $ math_score   : num  72 69 90 47 76 71 88 40 64 38 ...
##  $ reading_score: num  72 90 95 57 78 83 95 43 64 60 ...
##  $ writing_score: num  74 88 93 44 75 78 92 39 67 50 ...
##  $ totalscore   : num  218 247 278 148 229 232 275 122 195 148 ...

There is a clear divide between the score on each test and gender. Males tended to have a higher math score and a lower reading and writing score than females.

ggplot(data, aes(x=totalscore, y=math_score, color=gender)) + geom_point(alpha=0.5)

ggplot(data, aes(x=totalscore, y=reading_score, color=gender)) + geom_point(alpha=0.5)

ggplot(data, aes(x=totalscore, y=writing_score, color=gender)) + geom_point(alpha=0.5)

Looking at the data, female students averaged more points than males in all but math score.

genderresults <- data %>%
  group_by(gender) %>%
  summarise(mavg=mean(math_score), ravg=mean(reading_score), wavg=mean(writing_score), tavg=mean(totalscore))

genderresults
## # A tibble: 2 x 5
##   gender  mavg  ravg  wavg  tavg
##   <fct>  <dbl> <dbl> <dbl> <dbl>
## 1 female  63.6  72.6  72.5  209.
## 2 male    68.7  65.5  63.3  198.

The rest of the data shows that a majority of the other variables in the dataset are fairly even in both genders. Showing that there is not any other obvious variable that could explain why females are getting such higher scores in this dataset.

ggplot(data, aes(x=gender, fill=test_prep)) + geom_bar(position = 'dodge')

ggplot(data, aes(x=gender, fill=lunch)) + geom_bar(position = 'dodge')

ggplot(data, aes(x=gender, fill=parenteduc)) + geom_bar(position = 'dodge')

The only clear difference is in race and ethnicity. Females had a much higher number of students that fell under racial group C than males. Racial group C is the median average total score, likely bringing the female score up. The males also had a higher number of students in racial group A, which has the lowest average score, likely bringing their score down.

ggplot(data, aes(x=gender, fill=race_eth)) + geom_bar(position = 'dodge')

ggplot(data, aes(x=race_eth, y=totalscore)) + geom_boxplot()

Looking at the lunch variable, there appears to be some correlation between lunch and higher test scores.

ggplot(data, aes(x=totalscore, y=math_score, color=lunch)) + geom_point(alpha=0.5)

ggplot(data, aes(x=totalscore, y=reading_score, color=lunch)) + geom_point(alpha=0.5)

ggplot(data, aes(x=totalscore, y=writing_score, color=lunch)) + geom_point(alpha=0.5)

When only using those below or above one standard deviation it becomes clear that a majority of the best students are ones with standard lunch. Along with the lower scores still being a mix of students with standard or free/reduced lunch.

lunchdata <- data %>%
  group_by(lunch) %>%
  summarize(mavg=mean(math_score), ravg=mean(reading_score), wavg=mean(writing_score), tavg=mean(totalscore))

sd(data$totalscore)
## [1] 42.77198
mean(data$totalscore)
## [1] 203.312
lunchdatagraph <- data %>%
  group_by(lunch) %>%
  filter(totalscore < 161 | totalscore > 245)

ggplot(lunchdatagraph, aes(x=totalscore, y=math_score, color=lunch)) + geom_point(alpha=0.5)

ggplot(lunchdatagraph, aes(x=totalscore, y=reading_score, color=lunch)) + geom_point(alpha=0.5)

ggplot(lunchdatagraph, aes(x=totalscore, y=writing_score, color=lunch)) + geom_point(alpha=0.5)

Using linear regression to predict total score, gender and lunch were used, along with writing score because it had the largest difference in average score, between male and female compared to the other tests. Which resulted in an accurate model with an R-squared of 0.9635.

dataforreg <- data

nrows = nrow(data)

set.seed(999)

rows <- sample(nrows)
dataforreg = dataforreg[rows,]
split <- round(nrow(dataforreg) * .70)
train = dataforreg[1:split,]
test = dataforreg[(split+1):nrows,]

mod1 <- lm(totalscore ~ gender+writing_score+lunch, data=train)
summary(mod1)
## 
## Call:
## lm(formula = totalscore ~ gender + writing_score + lunch, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -23.3769  -5.0585  -0.1788   5.8125  22.5521 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.1477     1.5882   0.723     0.47    
## gendermale     14.6116     0.6552  22.301  < 2e-16 ***
## writing_score   2.8383     0.0222 127.861  < 2e-16 ***
## lunchstandard   2.8492     0.6796   4.193 3.11e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.296 on 696 degrees of freedom
## Multiple R-squared:  0.9635, Adjusted R-squared:  0.9634 
## F-statistic:  6129 on 3 and 696 DF,  p-value: < 2.2e-16
pred1 <- predict(mod1, newdata = test)
result <- cbind(test$totalscore, pred1)

plot(result)