##Question 1, Importing Data

library(QuantPsyc)
## Loading required package: boot
## Loading required package: MASS
## 
## Attaching package: 'QuantPsyc'
## The following object is masked from 'package:base':
## 
##     norm
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:boot':
## 
##     logit
library(corrr)
library(stats)
library(MASS)
library(corrgram)
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus
library(corrplot)
## corrplot 0.84 loaded
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
## 
##     panel.fill
## The following object is masked from 'package:boot':
## 
##     melanoma
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(rmarkdown)
PJ1a <- read.csv("/Users/Lorraine/Desktop/Project1a.csv")

Checking for out-of-range values

hist(PJ1a$x1)

hist(PJ1a$x2)

hist(PJ1a$x3)

hist(PJ1a$x4)

hist(PJ1a$x5)

hist(PJ1a$y)
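
The six histograms above can also be drawn on a single panel for easier side-by-side comparison; a minimal sketch using base graphics:

op <- par(mfrow = c(2, 3))                 # 2 x 3 grid of panels
for (v in c("x1", "x2", "x3", "x4", "x5", "y")) {
  hist(PJ1a[[v]], main = v, xlab = v)
}
par(op)                                    # restore the previous plotting settings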

summary(PJ1a)
##       case             x1               x2              x3       
##  Min.   :101.0   Min.   :0.0000   Min.   :1.450   Min.   :2.000  
##  1st Qu.:138.2   1st Qu.:0.0000   1st Qu.:2.572   1st Qu.:3.500  
##  Median :175.5   Median :0.0000   Median :3.000   Median :3.800  
##  Mean   :175.5   Mean   :0.3133   Mean   :3.018   Mean   :3.835  
##  3rd Qu.:212.8   3rd Qu.:1.0000   3rd Qu.:3.450   3rd Qu.:4.175  
##  Max.   :250.0   Max.   :1.0000   Max.   :4.550   Max.   :5.000  
##        x4              x5              y        
##  Min.   :1.780   Min.   : 2.00   Min.   :0.500  
##  1st Qu.:3.470   1st Qu.: 9.00   1st Qu.:1.032  
##  Median :3.890   Median :11.00   Median :1.310  
##  Mean   :3.806   Mean   :11.26   Mean   :1.281  
##  3rd Qu.:4.192   3rd Qu.:13.00   3rd Qu.:1.500  
##  Max.   :5.000   Max.   :20.00   Max.   :2.000
describe(PJ1a)
##      vars   n   mean    sd median trimmed   mad    min    max  range  skew
## case    1 150 175.50 43.45 175.50  175.50 55.60 101.00 250.00 149.00  0.00
## x1      2 150   0.31  0.47   0.00    0.27  0.00   0.00   1.00   1.00  0.80
## x2      3 150   3.02  0.62   3.00    3.02  0.67   1.45   4.55   3.10 -0.04
## x3      4 150   3.83  0.59   3.80    3.86  0.44   2.00   5.00   3.00 -0.36
## x4      5 150   3.81  0.64   3.89    3.83  0.49   1.78   5.00   3.22 -0.48
## x5      6 150  11.26  2.87  11.00   11.20  2.97   2.00  20.00  18.00  0.11
## y       7 150   1.28  0.33   1.31    1.29  0.34   0.50   2.00   1.50 -0.21
##      kurtosis   se
## case    -1.22 3.55
## x1      -1.37 0.04
## x2      -0.32 0.05
## x3       0.27 0.05
## x4       0.33 0.05
## x5       0.32 0.23
## y       -0.57 0.03

There are no out-of-range values in the data. The means and standard deviations for each variable are reasonable given the observed ranges.
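
As a complement to the histograms and descriptives, a quick programmatic check can confirm there are no missing or out-of-range values; a minimal sketch, assuming x1 is intended to be a 0/1 indicator:

colSums(is.na(PJ1a))                                  # missing values per column
sum(!PJ1a$x1 %in% c(0, 1))                            # x1 values that are not 0 or 1
sapply(PJ1a[, c("x2", "x3", "x4", "x5", "y")], range) # observed min/max per variable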

##Question 2

cormatrix <- cor(PJ1a, use = "pairwise.complete.obs")
corrplot(cormatrix, method = "square", type = "upper", order = "original",
         tl.col = "black", tl.srt = 45, addCoef.col = "black", number.cex = .5)

The correlation between x1 and y is 0.09, between x2 and y is -0.07, between x3 and y is 0.08, between x4 and y is 0.11, and between x5 and y is -0.27. Despite these modest zero-order correlations, I would retain all five variables if I were advising a client or supervisor interested in best predicting the criterion.
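
The predictor-criterion correlations can also be extracted directly rather than read off the plot; a minimal sketch (assuming the first column of PJ1a is the case identifier):

round(cor(PJ1a[, c("x1", "x2", "x3", "x4", "x5")], PJ1a$y), 2)  # zero-order correlations with y
focus(correlate(PJ1a[, -1]), y)                                 # corrr equivalent, dropping the case column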

##Question 3a

regall<-lm(y ~ x1+x2+x3+x4+x5, data = PJ1a)
summary(regall)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = PJ1a)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.80964 -0.18829  0.02852  0.23992  0.58070 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.291263   0.218588   5.907  2.4e-08 ***
## x1           0.041506   0.055738   0.745 0.457689    
## x2          -0.041833   0.043713  -0.957 0.340176    
## x3           0.061307   0.049067   1.249 0.213530    
## x4           0.066487   0.044301   1.501 0.135594    
## x5          -0.034192   0.009197  -3.718 0.000287 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3137 on 144 degrees of freedom
## Multiple R-squared:  0.1159, Adjusted R-squared:  0.08518 
## F-statistic: 3.775 on 5 and 144 DF,  p-value: 0.003052

The fitted model is y = 1.29 + 0.042x1 - 0.042x2 + 0.061x3 + 0.066x4 - 0.034x5. A multiple regression was used to determine the extent to which the five predictor variables explained variance in the criterion variable. Results revealed that the model with all five predictors significantly predicted the criterion, R-squared = .12, F(5, 144) = 3.775, p < .05; however, only x5 contributed significantly to the model.
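
Since the predictors are measured on different scales, standardized weights make their relative contributions easier to compare; a minimal sketch using lm.beta() from the already loaded QuantPsyc package:

lm.beta(regall)    # standardized (beta) weights for the full model
confint(regall)    # 95% confidence intervals for the unstandardized coefficients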

##Question 3b

reg2<-lm(y~x1+x2, data = PJ1a)
reg3<- lm(y~x1+x2+x3+x4+x5, data = PJ1a)
m <- lm(y~1, data = PJ1a)
summary(reg2)
## 
## Call:
## lm(formula = y ~ x1 + x2, data = PJ1a)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.77552 -0.23177  0.05461  0.23652  0.73351 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.36385    0.13738   9.928   <2e-16 ***
## x1           0.05843    0.05802   1.007    0.316    
## x2          -0.03346    0.04371  -0.765    0.445    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3283 on 147 degrees of freedom
## Multiple R-squared:  0.01178,    Adjusted R-squared:  -0.001669 
## F-statistic: 0.8759 on 2 and 147 DF,  p-value: 0.4187
summary(reg3)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = PJ1a)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.80964 -0.18829  0.02852  0.23992  0.58070 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.291263   0.218588   5.907  2.4e-08 ***
## x1           0.041506   0.055738   0.745 0.457689    
## x2          -0.041833   0.043713  -0.957 0.340176    
## x3           0.061307   0.049067   1.249 0.213530    
## x4           0.066487   0.044301   1.501 0.135594    
## x5          -0.034192   0.009197  -3.718 0.000287 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3137 on 144 degrees of freedom
## Multiple R-squared:  0.1159, Adjusted R-squared:  0.08518 
## F-statistic: 3.775 on 5 and 144 DF,  p-value: 0.003052
anova(m, reg2,reg3)
## Analysis of Variance Table
## 
## Model 1: y ~ 1
## Model 2: y ~ x1 + x2
## Model 3: y ~ x1 + x2 + x3 + x4 + x5
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)   
## 1    149 16.030                               
## 2    147 15.841  2   0.18877 0.9590 0.38570   
## 3    144 14.172  3   1.66870 5.6518 0.00109 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The two-predictor model is y = 1.36 + 0.058x1 - 0.033x2. A multiple regression was used to determine the extent to which the two predictor variables explained variance in the criterion variable. Results revealed that the model with the two predictors did not significantly predict the criterion, R-squared = .01, F(2, 147) = 0.876, p > .05.

The five-predictor model is y = 1.29 + 0.042x1 - 0.042x2 + 0.061x3 + 0.066x4 - 0.034x5. This model significantly predicted the criterion, R-squared = .12, F(5, 144) = 3.775, p < .05, although only x5 contributed significantly.

A hierarchical model comparison (ANOVA) was used to determine whether a statistically significant change in fit occurred between the models, with an intercept-only baseline included. Adding x1 and x2 to the baseline did not significantly improve fit, F(2, 144) = 0.96, p > .05, whereas adding x3, x4, and x5 to the two-predictor model produced a significant improvement, F(3, 144) = 5.65, p < .05.
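
The R-squared change tested by the model comparison above can also be computed directly from the two fitted models; a minimal sketch (r2_step1 and r2_step2 are hypothetical names):

r2_step1 <- summary(reg2)$r.squared
r2_step2 <- summary(reg3)$r.squared
round(c(step1 = r2_step1, step2 = r2_step2, change = r2_step2 - r2_step1), 3)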

##Question 3c

FWD<- stepAIC(regall, direction = "forward", trace=FALSE)
summary(FWD)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5, data = PJ1a)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.80964 -0.18829  0.02852  0.23992  0.58070 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.291263   0.218588   5.907  2.4e-08 ***
## x1           0.041506   0.055738   0.745 0.457689    
## x2          -0.041833   0.043713  -0.957 0.340176    
## x3           0.061307   0.049067   1.249 0.213530    
## x4           0.066487   0.044301   1.501 0.135594    
## x5          -0.034192   0.009197  -3.718 0.000287 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3137 on 144 degrees of freedom
## Multiple R-squared:  0.1159, Adjusted R-squared:  0.08518 
## F-statistic: 3.775 on 5 and 144 DF,  p-value: 0.003052

The forward procedure retained all five predictors; x5 had the largest absolute t value, followed by x4, x3, x2, and x1. Note that because stepAIC() was given the already full model regall, direction = "forward" has nothing to add, so the result is identical to the full model in Question 3a; a forward search built up from the intercept-only model is sketched below.
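
A minimal sketch of such a forward search, starting from the intercept-only model m defined in Question 3b and letting stepAIC() add terms from the full candidate set (FWD2 is a hypothetical object name):

FWD2 <- stepAIC(m, direction = "forward",
                scope = list(lower = ~ 1, upper = ~ x1 + x2 + x3 + x4 + x5),
                trace = FALSE)
summary(FWD2)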

##Importing Data for Questions 4-6

PJ1b <- read.csv("/Users/Lorraine/Desktop/Project1b.csv")

##Question 4

PJ1b$dept<-as.factor(PJ1b$dept)
contrasts(PJ1b$dept)<-contr.treatment(3, base = 3)
PJ1b$dept
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3
## attr(,"contrasts")
##   1 2
## 1 1 0
## 2 0 1
## 3 0 0
## Levels: 1 2 3
reg4 <- lm(salary ~ dept, data = PJ1b)
summary(reg4)
## 
## Call:
## lm(formula = salary ~ dept, data = PJ1b)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15.15  -5.60  -2.60   4.40  34.85 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   54.600      3.223  16.938 6.56e-16 ***
## dept1         -9.446      4.288  -2.203   0.0363 *  
## dept2         -8.886      5.023  -1.769   0.0882 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.19 on 27 degrees of freedom
## Multiple R-squared:  0.1693, Adjusted R-squared:  0.1078 
## F-statistic: 2.752 on 2 and 27 DF,  p-value: 0.08173
coefficients(reg4)
## (Intercept)       dept1       dept2 
##   54.600000   -9.446154   -8.885714
by(PJ1b$salary, PJ1b$dept, mean)
## PJ1b$dept: 1
## [1] 45.15385
## -------------------------------------------------------- 
## PJ1b$dept: 2
## [1] 45.71429
## -------------------------------------------------------- 
## PJ1b$dept: 3
## [1] 54.6

The fitted model is salary = 54.6 - 9.446d1 - 8.886d2. A multiple regression with dummy (treatment) coding was used to determine the extent to which the two department predictors (psychology and biology) explained variance in the criterion variable (salary), with math as the reference group. The overall model did not significantly predict the criterion, R-squared = .17, F(2, 27) = 2.752, p > .05. However, the psychology coefficient was significant and negative, indicating that psychology faculty earned significantly less than math faculty. The mean salary is 45.15 for psychology, 45.71 for biology, and 54.60 for math.
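
A quick check confirms that the dummy-coded coefficients reproduce the group means: the intercept is the math (reference) mean and each coefficient is that department's mean minus the math mean; a minimal sketch (b is a hypothetical name):

b <- coefficients(reg4)
b["(Intercept)"]                # math (reference) mean: 54.60
b["(Intercept)"] + b["dept1"]   # psychology mean: 54.60 - 9.446 = 45.15
b["(Intercept)"] + b["dept2"]   # biology mean:    54.60 - 8.886 = 45.71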

##Question 5

contrasts(PJ1b$dept)<-contr.sum(3)
PJ1b$dept
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3
## attr(,"contrasts")
##   [,1] [,2]
## 1    1    0
## 2    0    1
## 3   -1   -1
## Levels: 1 2 3
reg5 <- lm(salary ~ dept, data =PJ1b)
summary(reg5)
## 
## Call:
## lm(formula = salary ~ dept, data = PJ1b)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15.15  -5.60  -2.60   4.40  34.85 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   48.489      1.921  25.236   <2e-16 ***
## dept1         -3.336      2.521  -1.323    0.197    
## dept2         -2.775      2.939  -0.944    0.353    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.19 on 27 degrees of freedom
## Multiple R-squared:  0.1693, Adjusted R-squared:  0.1078 
## F-statistic: 2.752 on 2 and 27 DF,  p-value: 0.08173
coefficients(reg5)
## (Intercept)       dept1       dept2 
##   48.489377   -3.335531   -2.775092

The fitted model is salary = 48.489 - 3.336d1 - 2.775d2. A multiple regression with unweighted effects (sum) coding was used to determine the extent to which the two effect-coded department variables (psychology and biology) explained variance in the criterion variable (salary), with math coded -1 on both. The overall model did not significantly predict the criterion, R-squared = .17, F(2, 27) = 2.752, p > .05. The intercept, 48.489, is the unweighted grand mean (the mean of the three department means), which differs slightly from the overall sample mean because the groups are unequal in size. The psychology mean falls 3.336 below the unweighted grand mean, and the biology mean falls 2.775 below it.
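
This can be verified directly: the intercept equals the mean of the three department means, not the mean of all 30 salaries; a minimal sketch (dept_means is a hypothetical name):

dept_means <- tapply(PJ1b$salary, PJ1b$dept, mean)
mean(dept_means)               # unweighted grand mean: 48.489
mean(PJ1b$salary)              # overall (weighted) sample mean, which differs with unequal n
dept_means - mean(dept_means)  # each department's deviation from the unweighted grand mean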

##Question 6

contrasts(PJ1b$dept)<-contr.treatment(3, base = 3)
PJ1b$dept
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3
## attr(,"contrasts")
##   1 2
## 1 1 0
## 2 0 1
## 3 0 0
## Levels: 1 2 3
reg6 <- lm(salary ~ dept+years+merit, data = PJ1b)
summary(reg6)
## 
## Call:
## lm(formula = salary ~ dept + years + merit, data = PJ1b)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.1097  -4.4762  -0.4368   5.1278  27.1315 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  53.4006     7.1289   7.491 7.63e-08 ***
## dept1       -10.8706     4.3233  -2.514   0.0187 *  
## dept2       -10.8914     5.0564  -2.154   0.0411 *  
## years         1.4156     0.7559   1.873   0.0729 .  
## merit        -0.6581     2.5157  -0.262   0.7958    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.707 on 25 degrees of freedom
## Multiple R-squared:  0.3025, Adjusted R-squared:  0.191 
## F-statistic: 2.711 on 4 and 25 DF,  p-value: 0.05291
coefficients(reg6)
## (Intercept)       dept1       dept2       years       merit 
##  53.4006200 -10.8706218 -10.8913660   1.4155975  -0.6580531
Forward<- stepAIC(reg6, direction = "forward", trace=FALSE)
summary(Forward)
## 
## Call:
## lm(formula = salary ~ dept + years + merit, data = PJ1b)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.1097  -4.4762  -0.4368   5.1278  27.1315 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  53.4006     7.1289   7.491 7.63e-08 ***
## dept1       -10.8706     4.3233  -2.514   0.0187 *  
## dept2       -10.8914     5.0564  -2.154   0.0411 *  
## years         1.4156     0.7559   1.873   0.0729 .  
## merit        -0.6581     2.5157  -0.262   0.7958    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.707 on 25 degrees of freedom
## Multiple R-squared:  0.3025, Adjusted R-squared:  0.191 
## F-statistic: 2.711 on 4 and 25 DF,  p-value: 0.05291
Backward <- stepAIC(reg6, direction = "backward", trace=FALSE)
summary(Backward)
## 
## Call:
## lm(formula = salary ~ dept + years, data = PJ1b)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.6335  -4.2676  -0.9386   4.8647  27.7783 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  51.7529     3.2779  15.788 7.73e-15 ***
## dept1       -11.1783     4.0850  -2.736   0.0110 *  
## dept2       -11.2151     4.8139  -2.330   0.0279 *  
## years         1.2941     0.5857   2.210   0.0361 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.531 on 26 degrees of freedom
## Multiple R-squared:  0.3006, Adjusted R-squared:  0.2199 
## F-statistic: 3.726 on 3 and 26 DF,  p-value: 0.02367
Stepwise<- stepAIC(reg6, direction = "both", trace=FALSE)
summary(Stepwise)
## 
## Call:
## lm(formula = salary ~ dept + years, data = PJ1b)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.6335  -4.2676  -0.9386   4.8647  27.7783 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  51.7529     3.2779  15.788 7.73e-15 ***
## dept1       -11.1783     4.0850  -2.736   0.0110 *  
## dept2       -11.2151     4.8139  -2.330   0.0279 *  
## years         1.2941     0.5857   2.210   0.0361 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.531 on 26 degrees of freedom
## Multiple R-squared:  0.3006, Adjusted R-squared:  0.2199 
## F-statistic: 3.726 on 3 and 26 DF,  p-value: 0.02367

The most efficient model retains the two department predictors and years. In that model, the psychology department had the largest effect (t = -2.74, p < .05), followed by the biology department (t = -2.33, p < .05) and years (t = 2.21, p < .05). The backward-elimination model (adjusted R-squared = .22) fits better than the forward-entry run (adjusted R-squared = .19); note that the latter is simply the full model, because stepAIC() with direction = "forward" was started from reg6 and had nothing to add.
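
A forward search that actually builds up from an intercept-only salary model can be sketched as follows, together with a direct AIC comparison of the full and pruned models (null_salary and Forward2 are hypothetical names):

null_salary <- lm(salary ~ 1, data = PJ1b)
Forward2 <- stepAIC(null_salary, direction = "forward",
                    scope = list(lower = ~ 1, upper = ~ dept + years + merit),
                    trace = FALSE)
summary(Forward2)

AIC(reg6, Backward)   # lower AIC favors the model without merit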