# Build the example data set, one row per person.
# status is the marital-status indicator:
#   0 = divorced or single
#   1 = married
df <- data.frame(
  income = c(
    45000, 48000, 54000, 57000, 85000, 69000,
    88000, 83000, 98000, 104000, 107000
  ),
  age = c(23, 25, 24, 29, 38, 36, 40, 59, 56, 64, 53),
  status = c(rep(0, 5), rep(1, 3), 0, 1, 1)
)

# Print the data frame to check its contents
df
##    income age status
## 1   45000  23      0
## 2   48000  25      0
## 3   54000  24      0
## 4   57000  29      0
## 5   85000  38      0
## 6   69000  36      1
## 7   88000  40      1
## 8   83000  59      1
## 9   98000  56      0
## 10 104000  64      1
## 11 107000  53      1
# Copy the three modelling columns of df into a separate data frame that is
# passed to the regression
df_reg <- with(df, data.frame(income = income, age = age, status = status))

# Display the regression data frame
df_reg
##    income age status
## 1   45000  23      0
## 2   48000  25      0
## 3   54000  24      0
## 4   57000  29      0
## 5   85000  38      0
## 6   69000  36      1
## 7   88000  40      1
## 8   83000  59      1
## 9   98000  56      0
## 10 104000  64      1
## 11 107000  53      1

Multiple Linear Regression Model:

Next, I will build a multiple regression model to predict income from age and marital status. As required, the model will include one quadratic term (Age^2), one dichotomous term (Marital Status), and one dichotomous-by-quantitative interaction term: Marital Status * Age^2.

# Fit the multiple regression of income on age and marital status.
# Inside a formula, `^` is a term-crossing operator, so a bare `age^2`
# collapses to plain `age` and no quadratic term is fitted. Wrapping it in
# I() makes R square the values arithmetically.
# `age * status` expands to age + status + age:status, so the model keeps the
# linear age effect, the status indicator, and their interaction.
# NOTE(review): any summary output pasted below this chunk was produced with
# the original formula (no quadratic term) and must be regenerated.
model <- lm(income ~ I(age^2) + status + age * status, data = df_reg)

# Show coefficient estimates, standard errors, and overall fit statistics
summary(model)
## 
## Call:
## lm(formula = income ~ age^2 + status + (age) * (status), data = df_reg)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -14563  -4525  -1802   5022  14574 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  11585.1    12593.7   0.920  0.38822   
## age           1628.1      364.8   4.463  0.00292 **
## status       35464.4    25561.5   1.387  0.20788   
## age:status    -772.0      565.1  -1.366  0.21417   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10400 on 7 degrees of freedom
## Multiple R-squared:  0.8526, Adjusted R-squared:  0.7895 
## F-statistic:  13.5 on 3 and 7 DF,  p-value: 0.002692
# Refit the model and auto-print it to show only the call and coefficients.
# I(age^2) is required for a quadratic term: a bare `age^2` inside a formula
# is interpreted as plain `age`.
# NOTE(review): coefficient output pasted below was produced with the original
# formula and must be regenerated.
lm(formula = income ~ I(age^2) + status + age * status, data = df_reg)
## 
## Call:
## lm(formula = income ~ age^2 + status + (age) * (status), data = df_reg)
## 
## Coefficients:
## (Intercept)          age       status   age:status  
##       11585         1628        35464         -772

The overall model is statistically significant: the p-value of the F-test (0.002692) is less than 0.05.

Age: Since the p-value (0.003) is less than .05, age is a significant predictor of income. Status: Since the p-value (0.208) is greater than .05, status is not a significant predictor of income.

We can use this equation to find the estimated income for an individual based on their age and marital status. For example, an individual who is 35 years old and married is estimated to have an income of $77,013:

Income = 11585.1 + 1628.1(35) + 35464.4(1) – 772(35 × 1)

# Estimated income for a 35-year-old married person (age = 35, status = 1),
# using the rounded coefficients from the fitted model:
# intercept + age effect + status effect + age:status interaction
Income <- 11585.1 +
  1628.1 * 35 +
  35464.4 * 1 -
  772 * (35 * 1)
Income
## [1] 77013

We could drop marital status as a predictor from the model because it doesn’t appear to add any predictive value for income.

## Residual Analysis

# Residuals against fitted values, with a horizontal reference line at zero
plot(
  x = fitted(model),
  y = resid(model)
)
abline(h = 0)

# Normal Q-Q plot of the residuals with its reference line
qqnorm(resid(model))
qqline(resid(model))

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.