##Is there a correlation between vocabulary level and gender, education level, and year within the data set?
library(ggplot2)
# Download the vocabulary data set (CSV with a header row) from GitHub and
# print a per-column summary as a first look at the data.
Vocab <- utils::read.csv(
  file = 'https://raw.githubusercontent.com/engine2031/Winter-Bridge-R-2021/main/Vocab.csv'
)
summary(object = Vocab)
## X year sex education
## Min. :19740001 Min. :1974 Length:30351 Min. : 0.00
## 1st Qu.:19870112 1st Qu.:1987 Class :character 1st Qu.:12.00
## Median :19942104 Median :1994 Mode :character Median :12.00
## Mean :19954597 Mean :1995 Mean :13.03
## 3rd Qu.:20063676 3rd Qu.:2006 3rd Qu.:15.00
## Max. :20162866 Max. :2016 Max. :20.00
## vocabulary
## Min. : 0.000
## 1st Qu.: 5.000
## Median : 6.000
## Mean : 6.004
## 3rd Qu.: 7.000
## Max. :10.000
# Mean and median of the two numeric measures, extracted with exact-match
# `[[` rather than `$`. The `## [1] ...` lines are the recorded console output.
mean(Vocab[["education"]])
## [1] 13.03423
median(Vocab[["education"]])
## [1] 12
mean(Vocab[["vocabulary"]])
## [1] 6.003657
median(Vocab[["vocabulary"]])
## [1] 6
# Give the five columns (X, year, sex, education, vocabulary, in that order)
# descriptive names with a single positional assignment.
names(Vocab)[seq_len(5)] <- c(
  "ID", "Year", "Sex", "Education_Level", "Vocabulary_Level"
)

# Split the data by sex, keeping every column except ID. which() discards rows
# where the comparison is NA, which is the same row selection subset() makes.
VocabMen <- Vocab[
  which(Vocab$Sex == "Male"),
  c("Year", "Education_Level", "Sex", "Vocabulary_Level")
]
VocabFemale <- Vocab[
  which(Vocab$Sex == "Female"),
  c("Year", "Education_Level", "Sex", "Vocabulary_Level")
]

# Per-column summaries of each subset; the `## ` lines are recorded output.
summary(VocabMen)
## Year Education_Level Sex Vocabulary_Level
## Min. :1974 Min. : 0.00 Length:13203 Min. : 0.000
## 1st Qu.:1987 1st Qu.:12.00 Class :character 1st Qu.: 5.000
## Median :1994 Median :13.00 Mode :character Median : 6.000
## Mean :1995 Mean :13.18 Mean : 5.956
## 3rd Qu.:2006 3rd Qu.:16.00 3rd Qu.: 7.000
## Max. :2016 Max. :20.00 Max. :10.000
summary(VocabFemale)
## Year Education_Level Sex Vocabulary_Level
## Min. :1974 Min. : 0.00 Length:17148 Min. : 0.00
## 1st Qu.:1987 1st Qu.:12.00 Class :character 1st Qu.: 5.00
## Median :1994 Median :12.00 Mode :character Median : 6.00
## Mean :1995 Mean :12.92 Mean : 6.04
## 3rd Qu.:2006 3rd Qu.:15.00 3rd Qu.: 7.00
## Max. :2016 Max. :20.00 Max. :10.00
# Scatter plots of vocabulary level against education level, one per sex.
# The male plot previously passed `group = factor(Year)` to ggplot() outside
# aes(), where it is not treated as an aesthetic; it is removed so both plots
# are specified identically.
# NOTE(review): both axes appear to take only a few distinct values, so points
# may overplot heavily; geom_count() or geom_jitter() might show density
# better -- confirm against the data before changing.
ggplot(VocabMen, aes(x = Education_Level, y = Vocabulary_Level)) +
  geom_point()
ggplot(VocabFemale, aes(x = Education_Level, y = Vocabulary_Level)) +
  geom_point()

# Histograms of education level for each sex.
ggplot(data = VocabMen) +
  geom_histogram(aes(x = Education_Level), bins = 30)
ggplot(data = VocabFemale) +
  geom_histogram(aes(x = Education_Level), bins = 30)

# Histograms of vocabulary level for each sex.
ggplot(data = VocabFemale) +
  geom_histogram(aes(x = Vocabulary_Level), bins = 30)
ggplot(data = VocabMen) +
  geom_histogram(aes(x = Vocabulary_Level), bins = 30)

# Box plots of vocabulary level per survey year; grouping by factor(Year)
# draws one box for each year along the continuous Year axis.
ggplot(VocabMen, aes(x = Year, y = Vocabulary_Level, group = factor(Year))) +
  geom_boxplot()
ggplot(VocabFemale, aes(x = Year, y = Vocabulary_Level, group = factor(Year))) +
  geom_boxplot()
##The scatter plots did not produce any notable patterns. There did not seem to be any specific correlation between education level and vocabulary level. In general, the data were evenly distributed.
##Our histograms show that, for both the male and female subsets, the majority had an education level between 10 and 16. The vocabulary histograms for both males and females followed a roughly normal (bell-shaped) distribution curve.
##The box plots for this data indicated that, prior to the year 2000, the vocabulary levels of females in this data set were on average slightly higher. After 2000, the vocabulary levels for males and females were nearly identical.