# We retrieve the data set (Vocab) from the package "carData"
library(car)
## Loading required package: carData
library(carData)
# Load the Vocab data set that ships with carData, then render its first
# six rows as a table.
data("Vocab", package = "carData")
knitr::kable(head(Vocab, n = 6L))
## |         | year|sex    | education| vocabulary|
## |:--------|----:|:------|---------:|----------:|
## |19740001 | 1974|Male   |        14|          9|
## |19740002 | 1974|Male   |        16|          9|
## |19740003 | 1974|Female |        10|          9|
## |19740004 | 1974|Female |        10|          5|
## |19740005 | 1974|Female |        12|          8|
## |19740006 | 1974|Male   |        16|          8|
library(tibble)
# Convert the Vocab data frame to a tibble used by the rest of the analysis
dta <- tibble::as_tibble(carData::Vocab)
# List the column names available in the tibble
colnames(dta)
## [1] "year" "sex" "education" "vocabulary"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lattice)
# Keep only the rows where sex is "Male" and display the first 6 of them
head(dplyr::filter(dta, sex == "Male"), n = 6L)
## # A tibble: 6 x 4
## year sex education vocabulary
## <dbl> <fct> <dbl> <dbl>
## 1 1974 Male 14 9
## 2 1974 Male 16 9
## 3 1974 Male 16 8
## 4 1974 Male 17 9
## 5 1974 Male 10 5
## 6 1974 Male 6 5
# Keep only the rows where sex is "Female" and display the first 6 of them
head(dplyr::filter(dta, sex == "Female"), n = 6L)
## # A tibble: 6 x 4
## year sex education vocabulary
## <dbl> <fct> <dbl> <dbl>
## 1 1974 Female 10 9
## 2 1974 Female 10 5
## 3 1974 Female 12 8
## 4 1974 Female 12 3
## 5 1974 Female 11 5
## 6 1974 Female 11 6
# Split the data into one tibble per sex
dta_m <- dplyr::filter(dta, sex == "Male")
dta_f <- dplyr::filter(dta, sex == "Female")
# Plot the raw male data: vocabulary against education, grouped by survey
# year, drawing the points plus a fitted regression line for each year.
lattice::xyplot(
  vocabulary ~ education,
  data = dta_m,
  groups = year,
  type = c("p", "r"),
  auto.key = list(columns = 6)
)

# 1. Recode year into ten-year bins: Q1 = before 1980, Q2 = 1980-1989,
#    Q3 = 1990-1999, Q4 = 2000-2009, Q5 = 2010 onwards.
#    case_when() uses the first condition that matches, so an upper bound per
#    bin is enough and no year is left unmatched. (The previous bounds skipped
#    1980, 1990, 2000 and 2010, which all ended up in an <NA> group.)
# 2. Group by bin and summarise the education-vocabulary correlation.
# 3. Conclusion (figures taken from the run printed below, which predates the
#    boundary-year fix -- re-run to refresh them): for males, the correlation
#    between education and vocabulary is positive and gets weaker over the
#    years, from 0.57 in Q1 (before 1980) to 0.47 in Q5.
dta_m %>%
  mutate(year = case_when(
    year < 1980 ~ "Q1",
    year < 1990 ~ "Q2",
    year < 2000 ~ "Q3",
    year < 2010 ~ "Q4",
    year >= 2010 ~ "Q5"
  )) %>%
  group_by(year) %>%
  summarise(cor = cor(education, vocabulary))
## # A tibble: 6 x 2
## year cor
## <chr> <dbl>
## 1 Q1 0.571
## 2 Q2 0.528
## 3 Q3 0.484
## 4 Q4 0.485
## 5 Q5 0.468
## 6 <NA> 0.492
# Repeat the analysis for females, starting with the raw-data plot:
# vocabulary against education, grouped by survey year, with points and a
# fitted regression line for each year.
lattice::xyplot(
  vocabulary ~ education,
  data = dta_f,
  groups = year,
  type = c("p", "r"),
  auto.key = list(columns = 6)
)

# Recode year into ten-year bins (Q1 = before 1980, Q2 = 1980-1989,
# Q3 = 1990-1999, Q4 = 2000-2009, Q5 = 2010 onwards), then compute the
# education-vocabulary correlation for females within each bin.
# case_when() uses the first condition that matches, so an upper bound per
# bin is enough and no year is left unmatched. (The previous bounds skipped
# 1980, 1990, 2000 and 2010, which all ended up in an <NA> group; the output
# printed below predates this fix and should be regenerated.)
dta_f %>%
  mutate(year = case_when(
    year < 1980 ~ "Q1",
    year < 1990 ~ "Q2",
    year < 2000 ~ "Q3",
    year < 2010 ~ "Q4",
    year >= 2010 ~ "Q5"
  )) %>%
  group_by(year) %>%
  summarise(cor = cor(education, vocabulary))
## # A tibble: 6 x 2
## year cor
## <chr> <dbl>
## 1 Q1 0.498
## 2 Q2 0.496
## 3 Q3 0.481
## 4 Q4 0.401
## 5 Q5 0.431
## 6 <NA> 0.472
# Conclusion: For females, the correlation between education and vocabulary is positive and gets slightly weaker over the years. In Q1 (years before 1980) the correlation is 0.50, while in Q4 (the 2000s) it is 0.40. The correlation then gets somewhat stronger again in Q5 (the years after 2010), at 0.43.