options(warn=-1)
#Require Necessary library
library(psych)
library(ggplot2)
library(plyr)
#create user defined function
multiLines <- function(model, ...){
if(class(model)!="lm"){
warning("Model must be the output of the function lm()")
}
if(length(model$xlevels)!=1){
warning("Model must contain exactly one categorical predictor")
}
if(length(model$coef)-length(model$xlevels[[1]])!=1){
warning("Model must contain exactly one non-categorical predictor")
}
palette <- c("#E69F00", "#56B4E9", "#D55E00", "#009E73", "#CC79A7", "#F0E442", "#0072B2")
baseIntercept <- model$coef[1]
nLines <- length(model$xlevels[[1]])
intercepts <- c(baseIntercept, rep(0, nLines-1))
indicatorInd <- c(1, rep(0, nLines)) # used to find slope parameter by process of elimination
for(i in 1:(nLines-1)){
indicatorName <- paste(names(model$contrasts),model$xlevels[[1]][1+i], sep = "")
intercepts[i+1] <- baseIntercept + model$coef[names(model$coef)==indicatorName]
indicatorInd <- indicatorInd + (names(model$coef)==indicatorName)
}
slope <- model$coef[!indicatorInd]
num_pred = which(names(model$model[,-1]) != names(model$xlevels)) + 1
cat_pred = which(names(model$model[,-1]) == names(model$xlevels)) + 1
model$model$COL = NA
model$model$PCH = NA
for(i in 1:nLines){
model$model$COL[model$model[,cat_pred] == levels(model$model[,cat_pred])[i]] = adjustcolor(palette[i],0.40)
model$model$PCH[model$model[,cat_pred] == levels(model$model[,cat_pred])[i]] = i+14
}
plot(model$model[,1] ~ jitter(model$model[,num_pred]), col = model$model$COL, pch = model$model$PCH,
ylab = names(model$model)[1],
xlab = names(model$model)[num_pred])
for(j in 1:nLines){
abline(intercepts[j], slope, col = palette[j], lwd = 2, ...)
}
if(slope > 0){legend_pos = "bottomright"}
if(slope < 0){legend_pos = "topleft"}
legend(legend_pos, col = palette[1:nLines], lty = 1, legend = levels(model$model[,cat_pred]), lwd = 2)
}
# load data
divorce_df <- read.table("https://raw.githubusercontent.com/fivethirtyeight/data/master/marriage/divorce.csv",
header = TRUE, sep = ",")
#Preapare Data
#Format Dataframe in the form of 3 variables Divorce Rate(Response/Quantitative variable), Year(Explanatory/Quantitative variable) and Education level(Explanatory/Qualitative variable). Will will keep age constant at range 35-44 for this analysis
divorce_sub <- divorce_df[c('HS_3544','BAp_3544','year')]
colnames(divorce_sub) <- c('High_school','Bachelors_degree_more','year')
divorce <- divorce_df[c('HS_3544','year')]
Education_level <- rep('High_school',nrow(divorce_df))
divorce_rate <- cbind(divorce,Education_level)
colnames(divorce_rate)[1] <- 'Divorce_rate'
divorce_1 <- divorce_df[c('BAp_3544','year')]
Education_level_1 <- rep('Bachelors_degree_more',nrow(divorce_df))
divorce_rate_1 <- cbind(divorce_1,Education_level_1)
colnames(divorce_rate_1)[c(1,3)] <- c('Divorce_rate','Education_level')
divorce_rate_df <- rbind(divorce_rate,divorce_rate_1)
In this project I will analyze any possible correlation between education level, year and with rate of divorce. Most of us know that in recent years the divorce rates has been increasing, but I would like to know if the education level of couples affect it. I would be using two categories of educational level in this our analysis. The first group will be couples that possess high school diploma as their highest level of education. The second group will be couples that possess bachelors degree or higher as their highest level of education
Data was gotten from from fivethirtyeight website. They described that their data was collected from American Community Survey (years 2001-2012), via IPUMS USA. Raw file link https://github.com/fivethirtyeight/data/tree/master/marriage After loading the data into a data frame. I went on to munge the data to the desired format for analysis
. This is an observational study. There are 17 cases in general corresponding to each year (from 1960 -2012).
I will be studying 3 variable they are listed below: Year (Quantitative/Explanatory), educational level (Qualitative/Explanatory) and divorce rate (Quantitative/Response) variables
This is an observational study. I arrived at the conclusion by acknowledging that the divorce rates are based on the sample population of couples in U.S. I also noted that the independent variables like year and education level are not under control of the researcher
The subset of the US population used in this study is married couples that were divorced from 1960 to 2012. Also, I limited the data set for couples between age of 35 to 44. We may not be able to generalize the study because of the limited data set. We need sample size to be greater than 30, and 10% of the population. However, I could not verify this information from fiverthirtyeight website.
Summary stats
str(divorce_rate_df)
## 'data.frame': 34 obs. of 3 variables:
## $ Divorce_rate : num 0.0349 0.05 0.1042 0.1594 0.1754 ...
## $ year : int 1960 1970 1980 1990 2000 2001 2002 2003 2004 2005 ...
## $ Education_level: Factor w/ 2 levels "High_school",..: 1 1 1 1 1 1 1 1 1 1 ...
describe.by(divorce_rate_df,group='Education_level')
##
## Descriptive statistics by group
## group: High_school
## vars n mean sd median trimmed mad min
## Divorce_rate 1 17 0.16 0.05 0.18 0.17 0.02 0.03
## year 2 17 1998.71 15.04 2004.00 2000.40 5.93 1960.00
## Education_level* 3 17 1.00 0.00 1.00 1.00 0.00 1.00
## max range skew kurtosis se
## Divorce_rate 0.19 0.16 -1.65 1.21 0.01
## year 2012.00 52.00 -1.38 0.62 3.65
## Education_level* 1.00 0.00 NaN NaN 0.00
## --------------------------------------------------------
## group: Bachelors_degree_more
## vars n mean sd median trimmed mad min max
## Divorce_rate 1 17 0.10 0.02 0.1 0.1 0.00 0.03 0.11
## year 2 17 1998.71 15.04 2004.0 2000.4 5.93 1960.00 2012.00
## Education_level* 3 17 2.00 0.00 2.0 2.0 0.00 2.00 2.00
## range skew kurtosis se
## Divorce_rate 0.09 -2.09 2.88 0.01
## year 52.00 -1.38 0.62 3.65
## Education_level* 0.00 NaN NaN 0.00
ggplot(divorce_rate_df, aes(x=year, y=Divorce_rate ,group=Education_level)) + geom_point(aes(color=Education_level))
ggplot(divorce_rate_df, aes(x=year, y=Divorce_rate ,group=Education_level)) + geom_boxplot(aes(color=Education_level))
We can see a huge spike in divorce rates from 1980’s and a maintained high level of divorce rates in the 2000’s for both education levels. However, their is a much higher rate of divorce for couples with only highschool diploma
I would be performing a multivariable linear regression analysis
m_divorce <- lm(Divorce_rate ~ year + Education_level ,data=divorce_rate_df)
summary(m_divorce)
##
## Call:
## lm(formula = Divorce_rate ~ year + Education_level, data = divorce_rate_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.045595 -0.010588 0.003213 0.008142 0.043961
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -4.3346302 0.4321775 -10.03
## year 0.0022488 0.0002162 10.40
## Education_levelBachelors_degree_more -0.0642718 0.0063116 -10.18
## Pr(>|t|)
## (Intercept) 2.99e-11 ***
## year 1.24e-11 ***
## Education_levelBachelors_degree_more 2.07e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0184 on 31 degrees of freedom
## Multiple R-squared: 0.8724, Adjusted R-squared: 0.8641
## F-statistic: 105.9 on 2 and 31 DF, p-value: 1.389e-14
#plot(divorce_rate_df)
Analyzing the p-value(1.24e-11,2.07e-11) we can see that there is a strong relationship between both year and education level to the divorce rates, since they are much less than our significant level of 0.05.
We can also see a strong determination coefficient in R-squared of 0.8724. In this model we can claim that 87% of variability of the divorce rates is explained by the model
Based on the linear regression summary values. We can construct the linear regression equation for divorce rate of couples with bachelors degree or more
\[divorcerate = -4.3346302 + 0.0022488 \times year -0.0642718 \times Educationlevel\] Since education level is 1 for Bachelors_degree of more and 0 for high school. We can derive the equation of the regression line
Education level of bachelor’s degree or more Regression line \[divorcerate = -4.398902 + 0.0022488 \times year \]
Education level of High School Diploma Regression line \[divorcerate = -4.3346302 + 0.0022488 \times year \]
Plot of Divorce Rates with regression lines
ggplot(divorce_rate_df, aes(x=year, y=Divorce_rate ,group=Education_level)) + geom_point(aes(color=Education_level)) + geom_abline(aes(intercept = -4.398902, slope = 0.0022488,color=Education_level)) +
geom_abline(aes(intercept = -4.3346302, slope = 0.0022488,color=Education_level[2]))
multiLines(m_divorce)
Check conditions
1.Lets check independence of explanatory variables. This will ensure that their is no collinearity Based on common knowledge we can say that YEAR is not depedent on Educational Level
plot(divorce_rate_df$year,residuals(m_divorce),xlab="year")
abline(h = 0, lty = 3) # adds a horizontal dashed line at y = 0
3.Check if residuals are nearly normal ; We will plot the residual qqplot and histogram
hist(m_divorce$residuals)
m_divorce.stdres= rstandard(m_divorce)
qqnorm(m_divorce.stdres,
ylab="Standardized Residuals",
xlab="divorce_rates",
main="Divorce_rate_residuals")
qqline(m_divorce.stdres)
We can see that the histogram looks fairly normal and the qqnorm plots has the data points close to the abline
fit_Bachelors <- -4.398902 + 0.0022488*divorce_rate_df$year
fit_High_school <- -4.3346302 + 0.0022488*divorce_rate_df$year
fit<- c(fit_High_school[1:17],fit_Bachelors[18:34] )
divorce_rate_fit <- cbind(divorce_rate_df,fit)
plot(m_divorce$residuals ~ divorce_rate_fit$fit)
abline(h = 0, lty = 3) # adds a horizontal dashed line at y = 0
plot(fitted(m_divorce), residuals(m_divorce),
xlab="Predicted divorce ratess", ylab="Residuals")
abline(h = 0, lty = 3) # adds a horizontal dashed line at y = 0
Looks not quite constant but it does not look like it is linear.
I can conclude that from the data points, there is a strong relationship between the education level and divorce rates. However, to truely perform a detailed analysis, we would need more data points to ensure a satisfactory sample size criteria
Jason Bryer- MultiLines function