##Question 1, Importing Data
library(QuantPsyc)
## Loading required package: boot
## Loading required package: MASS
##
## Attaching package: 'QuantPsyc'
## The following object is masked from 'package:base':
##
## norm
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:boot':
##
## logit
library(corrr)
library(stats)
library(MASS)
library(corrgram)
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
library(corrplot)
## corrplot 0.84 loaded
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
##
## panel.fill
## The following object is masked from 'package:boot':
##
## melanoma
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(rmarkdown)
PJ1a <- read.csv("/Users/Lorraine/Desktop/Project1a.csv")
Checking for out-of-range values
hist(PJ1a$x1)
hist(PJ1a$x2)
hist(PJ1a$x3)
hist(PJ1a$x4)
hist(PJ1a$x5)
hist(PJ1a$y)
summary(PJ1a)
## case x1 x2 x3
## Min. :101.0 Min. :0.0000 Min. :1.450 Min. :2.000
## 1st Qu.:138.2 1st Qu.:0.0000 1st Qu.:2.572 1st Qu.:3.500
## Median :175.5 Median :0.0000 Median :3.000 Median :3.800
## Mean :175.5 Mean :0.3133 Mean :3.018 Mean :3.835
## 3rd Qu.:212.8 3rd Qu.:1.0000 3rd Qu.:3.450 3rd Qu.:4.175
## Max. :250.0 Max. :1.0000 Max. :4.550 Max. :5.000
## x4 x5 y
## Min. :1.780 Min. : 2.00 Min. :0.500
## 1st Qu.:3.470 1st Qu.: 9.00 1st Qu.:1.032
## Median :3.890 Median :11.00 Median :1.310
## Mean :3.806 Mean :11.26 Mean :1.281
## 3rd Qu.:4.192 3rd Qu.:13.00 3rd Qu.:1.500
## Max. :5.000 Max. :20.00 Max. :2.000
describe(PJ1a)
## vars n mean sd median trimmed mad min max range skew
## case 1 150 175.50 43.45 175.50 175.50 55.60 101.00 250.00 149.00 0.00
## x1 2 150 0.31 0.47 0.00 0.27 0.00 0.00 1.00 1.00 0.80
## x2 3 150 3.02 0.62 3.00 3.02 0.67 1.45 4.55 3.10 -0.04
## x3 4 150 3.83 0.59 3.80 3.86 0.44 2.00 5.00 3.00 -0.36
## x4 5 150 3.81 0.64 3.89 3.83 0.49 1.78 5.00 3.22 -0.48
## x5 6 150 11.26 2.87 11.00 11.20 2.97 2.00 20.00 18.00 0.11
## y 7 150 1.28 0.33 1.31 1.29 0.34 0.50 2.00 1.50 -0.21
## kurtosis se
## case -1.22 3.55
## x1 -1.37 0.04
## x2 -0.32 0.05
## x3 0.27 0.05
## x4 0.33 0.05
## x5 0.32 0.23
## y -0.57 0.03
There is no out-of-range values in the data. The means and standard deviations for each variable are reasonable given the ranges.
##Question 2
cormatrix <- cor(PJ1a, use="pairwise.complete.obs")
corrplot(cormatrix, method = "square", type="upper", order="original", tl.col="black", tl.srt=45, addCoef.col = TRUE, number.cex = .5)
The correlation between x1 and y is 0.09, the correlation between x2 and y is -0.07, the correlation between x3 and y is 0.08, the correlation between x4 and y is 0.11, and the correlation between x5 and y is -0.27. I would retain all 5 variables if I were providing advice to a client/supervisor interested in best predicting the criterion.
##Question 3a
regall<-lm(y ~ x1+x2+x3+x4+x5, data = PJ1a)
summary(regall)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = PJ1a)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.80964 -0.18829 0.02852 0.23992 0.58070
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.291263 0.218588 5.907 2.4e-08 ***
## x1 0.041506 0.055738 0.745 0.457689
## x2 -0.041833 0.043713 -0.957 0.340176
## x3 0.061307 0.049067 1.249 0.213530
## x4 0.066487 0.044301 1.501 0.135594
## x5 -0.034192 0.009197 -3.718 0.000287 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3137 on 144 degrees of freedom
## Multiple R-squared: 0.1159, Adjusted R-squared: 0.08518
## F-statistic: 3.775 on 5 and 144 DF, p-value: 0.003052
y=1.29+0.042x1-0.042x2+0.061x3+0.066x4-0.034x5 A multiple regression was used to determine the extent to which the five predictor variables explained variance in the criterion variable. Results revealed that the model with all five predictors significantly predicted the creterion, R square = .12, F(5, 144) = 3.775, p < .05. However, only X5 significantly contributed to the model.
##Question 3b
reg2<-lm(y~x1+x2, data = PJ1a)
reg3<- lm(y~x1+x2+x3+x4+x5, data = PJ1a)
m <- lm(y~1, data = PJ1a)
summary(reg2)
##
## Call:
## lm(formula = y ~ x1 + x2, data = PJ1a)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.77552 -0.23177 0.05461 0.23652 0.73351
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.36385 0.13738 9.928 <2e-16 ***
## x1 0.05843 0.05802 1.007 0.316
## x2 -0.03346 0.04371 -0.765 0.445
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3283 on 147 degrees of freedom
## Multiple R-squared: 0.01178, Adjusted R-squared: -0.001669
## F-statistic: 0.8759 on 2 and 147 DF, p-value: 0.4187
summary(reg3)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = PJ1a)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.80964 -0.18829 0.02852 0.23992 0.58070
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.291263 0.218588 5.907 2.4e-08 ***
## x1 0.041506 0.055738 0.745 0.457689
## x2 -0.041833 0.043713 -0.957 0.340176
## x3 0.061307 0.049067 1.249 0.213530
## x4 0.066487 0.044301 1.501 0.135594
## x5 -0.034192 0.009197 -3.718 0.000287 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3137 on 144 degrees of freedom
## Multiple R-squared: 0.1159, Adjusted R-squared: 0.08518
## F-statistic: 3.775 on 5 and 144 DF, p-value: 0.003052
anova(m, reg2,reg3)
## Analysis of Variance Table
##
## Model 1: y ~ 1
## Model 2: y ~ x1 + x2
## Model 3: y ~ x1 + x2 + x3 + x4 + x5
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 149 16.030
## 2 147 15.841 2 0.18877 0.9590 0.38570
## 3 144 14.172 3 1.66870 5.6518 0.00109 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
y2=1.36+0.058x1-0.033x2 A multiple regression was used to determine the extent to which the two predictor variables explained variance in the criterion variable. Results revealed that the model with both two predictors did not significantly predict the creterion, R square = .01, F(2, 147) = 0.8759, p > .05. y3=1.29+0.042x1-0.042x2+0.061x3+0.066x4-0.034x5 A multiple regression was used to determine the extent to which the five predictor variables explained variance in the criterion variable. Results revealed that the model with all five predictors significantly predicted the creterion, R square = .12, F(5, 144) = 3.775, p < .05. However, only X5 significantly contributed to the model. An anova analysis was used to determine whether a statistically significant change occurred bwtween the two models. A baseline model was created to see whether model 1 had a significance itself, and the result suggested that model 1 was not statistically significant, F(2, 147) = 0.959, p>.05.there was a significant change in two models, F(3, 144) = 5.65, p < .05.
##Question 3c
FWD<- stepAIC(regall, direction = "forward", trace=FALSE)
summary(FWD)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = PJ1a)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.80964 -0.18829 0.02852 0.23992 0.58070
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.291263 0.218588 5.907 2.4e-08 ***
## x1 0.041506 0.055738 0.745 0.457689
## x2 -0.041833 0.043713 -0.957 0.340176
## x3 0.061307 0.049067 1.249 0.213530
## x4 0.066487 0.044301 1.501 0.135594
## x5 -0.034192 0.009197 -3.718 0.000287 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3137 on 144 degrees of freedom
## Multiple R-squared: 0.1159, Adjusted R-squared: 0.08518
## F-statistic: 3.775 on 5 and 144 DF, p-value: 0.003052
The result suggested that X5 has the most significant t-value, followed by x4, x3, x2, and x1. ##Importint data for question 4-6
PJ1b <- read.csv("/Users/Lorraine/Desktop/Project1b.csv")
##Question 4
PJ1b$dept<-as.factor(PJ1b$dept)
contrasts(PJ1b$dept)<-contr.treatment(3, base = 3)
PJ1b$dept
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3
## attr(,"contrasts")
## 1 2
## 1 1 0
## 2 0 1
## 3 0 0
## Levels: 1 2 3
reg4 <- lm(salary ~ dept, data = PJ1b)
summary(reg4)
##
## Call:
## lm(formula = salary ~ dept, data = PJ1b)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.15 -5.60 -2.60 4.40 34.85
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54.600 3.223 16.938 6.56e-16 ***
## dept1 -9.446 4.288 -2.203 0.0363 *
## dept2 -8.886 5.023 -1.769 0.0882 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.19 on 27 degrees of freedom
## Multiple R-squared: 0.1693, Adjusted R-squared: 0.1078
## F-statistic: 2.752 on 2 and 27 DF, p-value: 0.08173
coefficients(reg4)
## (Intercept) dept1 dept2
## 54.600000 -9.446154 -8.885714
by(PJ1b$salary, PJ1b$dept, mean)
## PJ1b$dept: 1
## [1] 45.15385
## --------------------------------------------------------
## PJ1b$dept: 2
## [1] 45.71429
## --------------------------------------------------------
## PJ1b$dept: 3
## [1] 54.6
y=54.6-9.446d1-8.886d2 A multiple regression was used to determine the extent to which the two predictor variables (psychology and biology) explained variance in the criterion variable(salary) based a reference variable (math). Results suggested that the model with both two predictors did not significantly predict the creterion, R square = .17, F(2, 27) = 2.752, p > .05. However, psychology was significantly negatively correlated with salary. The mean salary of department psychology is 45,15, 45.71 for biology, and 54.6 for math.
##Question 5
contrasts(PJ1b$dept)<-contr.sum(3)
PJ1b$dept
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3
## attr(,"contrasts")
## [,1] [,2]
## 1 1 0
## 2 0 1
## 3 -1 -1
## Levels: 1 2 3
reg5 <- lm(salary ~ dept, data =PJ1b)
summary(reg5)
##
## Call:
## lm(formula = salary ~ dept, data = PJ1b)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.15 -5.60 -2.60 4.40 34.85
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.489 1.921 25.236 <2e-16 ***
## dept1 -3.336 2.521 -1.323 0.197
## dept2 -2.775 2.939 -0.944 0.353
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.19 on 27 degrees of freedom
## Multiple R-squared: 0.1693, Adjusted R-squared: 0.1078
## F-statistic: 2.752 on 2 and 27 DF, p-value: 0.08173
coefficients(reg5)
## (Intercept) dept1 dept2
## 48.489377 -3.335531 -2.775092
y=48.489-3.336d1-2.775d2 An unweighted multiple regression was used to determine the extent to which the two predictor variables (psychology and biology) explained variance in the criterion variable(salary) based a reference variable (math). Results suggested that the model with both two predictors did not significantly predict the creterion, R square = .17, F(2, 27) = 2.752, p > .05. The grand mean is 48.489. The difference between the grand mean and psychology department is 3.336, and the difference bwteen the grand mean and biology department is 2.775.
##Question 6
contrasts(PJ1b$dept)<-contr.treatment(3, base = 3)
PJ1b$dept
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3
## attr(,"contrasts")
## 1 2
## 1 1 0
## 2 0 1
## 3 0 0
## Levels: 1 2 3
reg6 <- lm(salary ~ dept+years+merit, data = PJ1b)
summary(reg6)
##
## Call:
## lm(formula = salary ~ dept + years + merit, data = PJ1b)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.1097 -4.4762 -0.4368 5.1278 27.1315
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.4006 7.1289 7.491 7.63e-08 ***
## dept1 -10.8706 4.3233 -2.514 0.0187 *
## dept2 -10.8914 5.0564 -2.154 0.0411 *
## years 1.4156 0.7559 1.873 0.0729 .
## merit -0.6581 2.5157 -0.262 0.7958
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.707 on 25 degrees of freedom
## Multiple R-squared: 0.3025, Adjusted R-squared: 0.191
## F-statistic: 2.711 on 4 and 25 DF, p-value: 0.05291
coefficients(reg6)
## (Intercept) dept1 dept2 years merit
## 53.4006200 -10.8706218 -10.8913660 1.4155975 -0.6580531
Forward<- stepAIC(reg6, direction = "forward", trace=FALSE)
summary(Forward)
##
## Call:
## lm(formula = salary ~ dept + years + merit, data = PJ1b)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.1097 -4.4762 -0.4368 5.1278 27.1315
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.4006 7.1289 7.491 7.63e-08 ***
## dept1 -10.8706 4.3233 -2.514 0.0187 *
## dept2 -10.8914 5.0564 -2.154 0.0411 *
## years 1.4156 0.7559 1.873 0.0729 .
## merit -0.6581 2.5157 -0.262 0.7958
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.707 on 25 degrees of freedom
## Multiple R-squared: 0.3025, Adjusted R-squared: 0.191
## F-statistic: 2.711 on 4 and 25 DF, p-value: 0.05291
Backward <- stepAIC(reg6, direction = "backward", trace=FALSE)
summary(Backward)
##
## Call:
## lm(formula = salary ~ dept + years, data = PJ1b)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.6335 -4.2676 -0.9386 4.8647 27.7783
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.7529 3.2779 15.788 7.73e-15 ***
## dept1 -11.1783 4.0850 -2.736 0.0110 *
## dept2 -11.2151 4.8139 -2.330 0.0279 *
## years 1.2941 0.5857 2.210 0.0361 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.531 on 26 degrees of freedom
## Multiple R-squared: 0.3006, Adjusted R-squared: 0.2199
## F-statistic: 3.726 on 3 and 26 DF, p-value: 0.02367
Stepwise<- stepAIC(reg6, direction = "both", trace=FALSE)
summary(Stepwise)
##
## Call:
## lm(formula = salary ~ dept + years, data = PJ1b)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.6335 -4.2676 -0.9386 4.8647 27.7783
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.7529 3.2779 15.788 7.73e-15 ***
## dept1 -11.1783 4.0850 -2.736 0.0110 *
## dept2 -11.2151 4.8139 -2.330 0.0279 *
## years 1.2941 0.5857 2.210 0.0361 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.531 on 26 degrees of freedom
## Multiple R-squared: 0.3006, Adjusted R-squared: 0.2199
## F-statistic: 3.726 on 3 and 26 DF, p-value: 0.02367
The most efficient predictors are the both department predictors and years. The result suggested that Psychology department is the most significant predictor (t-value = -2.736, p < .05), followed by biology department (t-value = -2.330, p < .05), and years(t-value = 2.210, p < .05). The backward removal model (adjusted R-squared = 0.22) is a better fit than the forward Entry Model(adjusted R-squared = 0.19).