##Is there a correlation between vocabulary and gender, education level, and year within the data set?

library(ggplot2)

# Download the Vocab CSV from GitHub and parse it into a data frame.
Vocab <- read.csv(
  file = "https://raw.githubusercontent.com/engine2031/Winter-Bridge-R-2021/main/Vocab.csv"
)

# Per-column overview of the raw data (printed output is kept below).
summary(object = Vocab)
##        X                 year          sex              education    
##  Min.   :19740001   Min.   :1974   Length:30351       Min.   : 0.00  
##  1st Qu.:19870112   1st Qu.:1987   Class :character   1st Qu.:12.00  
##  Median :19942104   Median :1994   Mode  :character   Median :12.00  
##  Mean   :19954597   Mean   :1995                      Mean   :13.03  
##  3rd Qu.:20063676   3rd Qu.:2006                      3rd Qu.:15.00  
##  Max.   :20162866   Max.   :2016                      Max.   :20.00  
##    vocabulary    
##  Min.   : 0.000  
##  1st Qu.: 5.000  
##  Median : 6.000  
##  Mean   : 6.004  
##  3rd Qu.: 7.000  
##  Max.   :10.000
# Mean and median of the education column (printed values kept under each call).
mean(Vocab[["education"]])
## [1] 13.03423
median(Vocab[["education"]])
## [1] 12
# Mean and median of the vocabulary column.
mean(Vocab[["vocabulary"]])
## [1] 6.003657
median(Vocab[["vocabulary"]])
## [1] 6
# Rename the first five columns, by position, to descriptive names
# (X -> ID, year -> Year, sex -> Sex, education -> Education_Level,
# vocabulary -> Vocabulary_Level).
names(Vocab)[1:5] <- c(
  "ID",
  "Year",
  "Sex",
  "Education_Level",
  "Vocabulary_Level"
)
# Split the respondents by sex, keeping Year, Education_Level, Sex and
# Vocabulary_Level (in that order). which() discards rows where the
# comparison is NA, so only rows that positively match are kept.
VocabMen <- Vocab[
  which(Vocab$Sex == "Male"),
  c("Year", "Education_Level", "Sex", "Vocabulary_Level")
]
VocabFemale <- Vocab[
  which(Vocab$Sex == "Female"),
  c("Year", "Education_Level", "Sex", "Vocabulary_Level")
]
# Per-column overview of the male subset (printed output is kept below).
summary(object = VocabMen)
##       Year      Education_Level     Sex            Vocabulary_Level
##  Min.   :1974   Min.   : 0.00   Length:13203       Min.   : 0.000  
##  1st Qu.:1987   1st Qu.:12.00   Class :character   1st Qu.: 5.000  
##  Median :1994   Median :13.00   Mode  :character   Median : 6.000  
##  Mean   :1995   Mean   :13.18                      Mean   : 5.956  
##  3rd Qu.:2006   3rd Qu.:16.00                      3rd Qu.: 7.000  
##  Max.   :2016   Max.   :20.00                      Max.   :10.000
# Per-column overview of the female subset (printed output is kept below).
summary(object = VocabFemale)
##       Year      Education_Level     Sex            Vocabulary_Level
##  Min.   :1974   Min.   : 0.00   Length:17148       Min.   : 0.00   
##  1st Qu.:1987   1st Qu.:12.00   Class :character   1st Qu.: 5.00   
##  Median :1994   Median :12.00   Mode  :character   Median : 6.00   
##  Mean   :1995   Mean   :12.92                      Mean   : 6.04   
##  3rd Qu.:2006   3rd Qu.:15.00                      3rd Qu.: 7.00   
##  Max.   :2016   Max.   :20.00                      Max.   :10.00
# Scatter plots of Vocabulary_Level against Education_Level, one per sex.
# Both plots use the same mapping so they can be compared directly.
# The male plot used to pass `group = factor(Year)` to ggplot() outside of
# aes(); arguments placed there are not aesthetic mappings, so it had no
# effect on the plot. It is removed here to match the female plot.
ggplot(VocabMen, aes(x = Education_Level, y = Vocabulary_Level)) +
  geom_point()

ggplot(VocabFemale, aes(x = Education_Level, y = Vocabulary_Level)) +
  geom_point()

# Histograms (30 bins each) of Education_Level for men, then women.
ggplot(VocabMen, aes(x = Education_Level)) +
  geom_histogram(bins = 30)

ggplot(VocabFemale, aes(x = Education_Level)) +
  geom_histogram(bins = 30)

# Histograms (30 bins each) of Vocabulary_Level for women, then men.
ggplot(VocabFemale, aes(x = Vocabulary_Level)) +
  geom_histogram(bins = 30)

ggplot(VocabMen, aes(x = Vocabulary_Level)) +
  geom_histogram(bins = 30)

# Box plots of Vocabulary_Level per survey year, one plot per sex.
# Year is numeric, so group = factor(Year) is needed to get one box per year
# rather than a single box for the whole data set.
ggplot(data = VocabMen) +
  geom_boxplot(
    aes(x = Year, y = Vocabulary_Level, group = factor(Year))
  )

ggplot(data = VocabFemale) +
  geom_boxplot(
    aes(x = Year, y = Vocabulary_Level, group = factor(Year))
  )

##The scatter plots did not produce any informative results. There did not seem to be any specific correlation between education level and vocabulary level. In general, the data was evenly distributed.

##Our histogram plots showed that for both the male and female subsets the majority had an education level between 10 and 16. The vocabulary histograms for both males and females followed a roughly bell-shaped (approximately normal) distribution curve.

##The box plots for this data indicated that prior to the year 2000 the vocabulary levels of females in this data set were on average slightly higher. After 2000 the vocabulary levels for both males and females were nearly identical.