/
survey = read.csv("https://raw.githubusercontent.com/connormckee1323/STA-490/main/w09-SurveyDataCsvFinal.csv", head = TRUE)
# names(survey)
The original survey data have three components, a self-compassion scale and gratitude questionnaire instruments, and some demographic questions.
The three components have different portions of missing values. We split the original data set into three subsets of data and impute the missing values related to the self-compassion and gratitude data based on the survey instruments. Since there are only a few missing values, we replace the missing values in each survey question with the mode of the associated survey item. We create indexes of the two instruments separately to aggregate the information in the two survey data sets.
Since R does not have a function to find the model of a given data set, I write the following function to find the model of a data set.
We will perform both principal component analysis (PCA) and exploratory factor analysis (EFA).
This instrument contains only the data associated with the 12 items in the survey instrument. In the original data file, the 12 variables are named Q2_1, Q2_2, …, Q2_12. We impute the missing value by replacing the missing value in each of the 12 items with the mode of the corresponding survey items. Since there are only a few missing values in this instrument, this imputation will not impact the subsequent PCA and EFA.
my.mode = function(dataset){
freq.tbl = table(dataset)
max.freq.id=which(freq.tbl==max(freq.tbl))
mode=names(freq.tbl[max.freq.id])
as.numeric(mode)
}
compassion = survey[, 1:12]
# imputing with the mode in each survey item
for (i in 1:12) {
compassion[,i][is.na(compassion[,i])]=my.mode(compassion[,i])
}
The gratitude questionnaire contains only the variables associated with gratitude questions. The variables used in the original data file are Q3_1, Q3_2, …, Q3_6. We use the same mode imputation method to fill in the missing values as used in the above self-compassion survey data. The gratitude questionnaire has even fewer missing values. Any imputation will not impact any subsequent analysis.
gratitude = survey[, 13:18]
# imputing with the mode in each survey item
for (i in 1:6) {
gratitude[,i][is.na(gratitude[,i])]=my.mode(gratitude[,i])
}
Since Likert scales of the Q3_3 and Q3_6 were in reverse order in the design. We transform back the usual order and create a new dataset using the same variable names.
gratitude.new = gratitude
gratitude.new$Q3_3 = 8-gratitude$Q3_3
gratitude.new$Q3_6 = 8-gratitude$Q3_6
The demographic variables have two issues: missing values and imbalance categories.The major issue of these categorical variables is the imbalance category. The following modifications to the original demographic variables are utilized.
survey00 = survey[, (24:32)]
survey00$Q9 <- ifelse(is.na(survey00$Q9), 99, survey00$Q9)
survey00$Q11 <- ifelse(is.na(survey00$Q11), 99, survey00$Q11)
survey00$Q13 <- ifelse(is.na(survey00$Q13), 99, survey00$Q13)
survey00$Q14 <- ifelse(is.na(survey00$Q14), 99, survey00$Q14)
survey00$Q15 <- ifelse(is.na(survey00$Q15), 99, survey00$Q15)
survey00$Q16 <- ifelse(is.na(survey00$Q16), 99, survey00$Q16)
survey00$Q17 <- ifelse(is.na(survey00$Q17), 99, survey00$Q17)
survey00$Q18 <- ifelse(is.na(survey00$Q18), 99, survey00$Q18)
survey00$Q19 <- ifelse(is.na(survey00$Q19), 99, survey00$Q19)
list(Q9=table(survey00$Q9),
Q11=table(survey00$Q11),
Q13=table(survey00$Q13),
Q14=table(survey00$Q14),
Q15=table(survey00$Q15),
Q16=table(survey00$Q16),
Q17=table(survey00$Q17),
Q18=table(survey00$Q18),
Q19=table(survey00$Q19))
demographic = survey[, -(1:18)]
demographic00=demographic
# replace missing values with 99.
demographic00[is.na(demographic00)] <- 99
As a rule of thumb, each category in the redefined variables must have at least 20 subjects to guarantee that the subsequent regression modeling is valid. I am going to redefine varibles for Q9, Q11, Q13, Q15, Q16, and Q17 since these variables have categories less than 20.
Gender = Q9: 1=“Male”, 2=“Female”, 3=“Other”
Race = Q11: 1=“White”, 2=“African-American/Black”, 5=“Hispanic”
Marital.st = Q13: 1=“Single”, 2=“Married/Civil Partner”, 3-4-6=“Other”
Religion = Q15: 1-8=“Religous” 9-100=“Non-Religous”
Sexual.orient = Q16: 1-4=“Non-straight”, 5=“Straight”, 6-10=“Non-straight”
Poli.affil = Q17: 1-3=“Republican”, 4=“Independent”, 5-7=“Democrat”
Q8.1=demographic00$Q8_1
grp.age=cut(Q8.1, breaks=c(1, 23, 30, 100), labels=c("[1,23]", "[24,30]", "[30,99]"))
#
Q8.2=demographic00$Q8_2
grp.edu=cut(Q8.2, breaks=c(0, 15.5, 19, 100), labels=c("Assoc", "Bachelor", "Adv.deg"))
#
Q8.3=demographic00$Q8_3
grp.empl=cut(Q8.3, breaks=c(-1,5, 9, 100), labels=c("entry", "junior", "senior"))
#
Q8.5=demographic00$Q8_5
kid.num=cut(Q8.5, breaks=c(-1,1,100), labels=c("No-kid", "With-kid"))
#
Q8.6=demographic00$Q8_6
home.size=cut(Q8.6, breaks=c(-1,2,100), labels=c("1-2", "3+"))
#
#
#
Q.20=demographic00$Q20
spirituality=rep("high", length(Q.20))
spirituality[which(Q.20==2)]="moderate"
spirituality[Q.20 %in% c(1,2,3)]="low"
#
Q9=survey00$Q9
gender= cut(Q9, breaks=c(0,1,2,100), labels=c("Male", "Female", "Other"))
Q11=survey00$Q11
race=cut(Q11, breaks=c(0,1,2,5), labels=c("White", "African American/Black", "Hispanic"))
Q13=survey00$Q13
marital.st=cut(Q13, breaks=c(0,1,2,100), labels=c("Single", "Married/Civil Partner", "Other"))
Q14=survey00$Q14
disability=cut(Q14, breaks=c(0,1,2), labels=c("Yes", "No"))
Q15=survey00$Q15
religion=cut(Q15, breaks=c(0,8,100), labels=c("Religous", "Non-Religous"))
Q16=survey00$Q16
sexual.orient=cut(Q16, breaks=c(0,4,5,100), labels=c("Non-Straight", "Straight", "Non-Straight"))
Q17=survey00$Q17
poli.affil=cut(Q17, breaks=c(0,3,4,100), labels=c("Republican", "Independent", "Democrat"))
Q18=survey00$Q18
SW.program=cut(Q18, breaks=c(0,1,100), labels=c("Undergraduate", "Graduate"))
Q19=survey00$Q19
urbanity=cut(Q19, breaks=c(0,1,2,100), labels=c("Urban", "Rural", "Suburban"))
new.demographics=data.frame(gender, race, marital.st, disability, religion, sexual.orient, poli.affil, SW.program, urbanity,grp.age, grp.edu, grp.empl, kid.num, home.size,spirituality)
Variables gender, race, sexual.orient, and poli.affil are variables that will not be valid to guarantee subsequent regression modeling.
For example, gender has 7 males, 94 females, and 3 other. There is no way to combine these categories to get 20 in each category. You can only be in one of the three.
Another example is sexual.orient. There are many different options in the questionnaire, but if you condense them, there is either straight or non-straight. Only 2 straight people filled out the questionnaire so this variable will not be valid to guarantee that the subsequent regression modeling.
pca <- prcomp(compassion, center = TRUE, scale = TRUE)
sc.idx.1 = pca$x[,1] # This is the first principal component
# you are expected to include the second
# principal component in the final data set as well.
sc.idx.2 = pca$x[,2]
gr.pca <- prcomp(gratitude.new, center = TRUE, scale = TRUE)
gr.idx.1 = gr.pca$x[,1]
gr.idx.2 = gr.pca$x[,2]
final.analytic.data = new.demographics # rename the data frame
final.analytic.data$sc.idx.1=sc.idx.1 # add the new variable
# to the above demographics data set
final.analytic.data$sc.idx.2=sc.idx.2
final.analytic.data$gr.idx.1=gr.idx.1
final.analytic.data$gr.idx.2=gr.idx.2
# final.analytic.data
## wrote the final analytic data frame to local drive in a csv format.
write.csv(final.analytic.data,'w09-analytic-data-Version02.csv')
First we will use all 18 variables to estimate the first self compassion index.
library(kableExtra)
## Warning in !is.null(rmarkdown::metadata$output) && rmarkdown::metadata$output
## %in% : 'length(x) = 3 > 1' in coercion to 'logical(1)'
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
full.model = lm(sc.idx.1 ~ ., data = final.analytic.data)
kable(summary(full.model)$coef, caption ="Statistics of Regression Coefficients")
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 0.4598924 | 1.4776601 | 0.3112302 | 0.7564667 |
| genderFemale | 0.6444900 | 1.0434856 | 0.6176319 | 0.5386404 |
| genderOther | 0.9796043 | 1.8962028 | 0.5166137 | 0.6069065 |
| raceAfrican American/Black | 0.6228372 | 1.0790247 | 0.5772224 | 0.5654732 |
| raceHispanic | 0.4204108 | 1.1865344 | 0.3543183 | 0.7240689 |
| marital.stMarried/Civil Partner | -0.4359647 | 0.7313594 | -0.5961019 | 0.5528555 |
| marital.stOther | 0.1930816 | 0.6610344 | 0.2920901 | 0.7710033 |
| disabilityNo | -0.4780631 | 0.6139675 | -0.7786456 | 0.4385748 |
| religionNon-Religous | -0.0843421 | 0.4905246 | -0.1719427 | 0.8639337 |
| sexual.orientStraight | -2.4886538 | 2.0282033 | -1.2270238 | 0.2235521 |
| poli.affilIndependent | -0.7976019 | 0.8802812 | -0.9060763 | 0.3677221 |
| poli.affilDemocrat | -0.7065847 | 0.8697243 | -0.8124238 | 0.4190532 |
| SW.programGraduate | 0.1804288 | 0.6523636 | 0.2765771 | 0.7828459 |
| urbanityRural | 0.6408153 | 0.6939948 | 0.9233720 | 0.3586979 |
| urbanitySuburban | 0.2458962 | 0.6424770 | 0.3827315 | 0.7029733 |
| grp.age[24,30] | 0.5659126 | 0.6265544 | 0.9032138 | 0.3692294 |
| grp.age[30,99] | -0.4291088 | 0.9341763 | -0.4593446 | 0.6472809 |
| grp.eduBachelor | -0.0884412 | 0.6091637 | -0.1451846 | 0.8849445 |
| grp.eduAdv.deg | 0.9778831 | 1.0008169 | 0.9770849 | 0.3315869 |
| grp.empljunior | -0.0666920 | 0.6054622 | -0.1101506 | 0.9125765 |
| grp.emplsenior | 0.1826155 | 0.7763772 | 0.2352149 | 0.8146663 |
| kid.numWith-kid | -0.4196830 | 0.7140495 | -0.5877506 | 0.5584195 |
| home.size3+ | -0.8073535 | 0.5173013 | -1.5607027 | 0.1226948 |
| spiritualitylow | 0.2142551 | 0.5119587 | 0.4185007 | 0.6767460 |
| sc.idx.2 | 0.0997127 | 0.2120149 | 0.4703099 | 0.6394631 |
| gr.idx.1 | -0.5776063 | 0.1513792 | -3.8156259 | 0.0002726 |
| gr.idx.2 | -0.2377947 | 0.2766674 | -0.8594966 | 0.3927346 |
We see that nearly every variable is not significant enough for the model. Next, we will use a manual model to best estimate what can estimate a self compassion score.
For the manual model, variables that often associate with self compassion are: spirituality, and religion.
manual.model = lm(sc.idx.1 ~ spirituality + religion, data = final.analytic.data)
kable(summary(manual.model)$coef, caption ="Statistics of Regression Coefficients")
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | -0.2568135 | 0.4338095 | -0.5919960 | 0.5551762 |
| spiritualitylow | 0.9117106 | 0.4484003 | 2.0332515 | 0.0446502 |
| religionNon-Religous | -0.1246212 | 0.4664109 | -0.2671919 | 0.7898662 |
Here we see that spirituality low is significant in predicting self compassion index.
library(MASS)
step.model = stepAIC(full.model,
scope = list(lower=formula(manual.model),upper=formula(full.model)),
direction = "forward", # forward selection
trace = 0 # do not show the details
)
kable(summary(step.model)$coef, caption ="Statistics of Regression Coefficients")
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 0.4598924 | 1.4776601 | 0.3112302 | 0.7564667 |
| genderFemale | 0.6444900 | 1.0434856 | 0.6176319 | 0.5386404 |
| genderOther | 0.9796043 | 1.8962028 | 0.5166137 | 0.6069065 |
| raceAfrican American/Black | 0.6228372 | 1.0790247 | 0.5772224 | 0.5654732 |
| raceHispanic | 0.4204108 | 1.1865344 | 0.3543183 | 0.7240689 |
| marital.stMarried/Civil Partner | -0.4359647 | 0.7313594 | -0.5961019 | 0.5528555 |
| marital.stOther | 0.1930816 | 0.6610344 | 0.2920901 | 0.7710033 |
| disabilityNo | -0.4780631 | 0.6139675 | -0.7786456 | 0.4385748 |
| religionNon-Religous | -0.0843421 | 0.4905246 | -0.1719427 | 0.8639337 |
| sexual.orientStraight | -2.4886538 | 2.0282033 | -1.2270238 | 0.2235521 |
| poli.affilIndependent | -0.7976019 | 0.8802812 | -0.9060763 | 0.3677221 |
| poli.affilDemocrat | -0.7065847 | 0.8697243 | -0.8124238 | 0.4190532 |
| SW.programGraduate | 0.1804288 | 0.6523636 | 0.2765771 | 0.7828459 |
| urbanityRural | 0.6408153 | 0.6939948 | 0.9233720 | 0.3586979 |
| urbanitySuburban | 0.2458962 | 0.6424770 | 0.3827315 | 0.7029733 |
| grp.age[24,30] | 0.5659126 | 0.6265544 | 0.9032138 | 0.3692294 |
| grp.age[30,99] | -0.4291088 | 0.9341763 | -0.4593446 | 0.6472809 |
| grp.eduBachelor | -0.0884412 | 0.6091637 | -0.1451846 | 0.8849445 |
| grp.eduAdv.deg | 0.9778831 | 1.0008169 | 0.9770849 | 0.3315869 |
| grp.empljunior | -0.0666920 | 0.6054622 | -0.1101506 | 0.9125765 |
| grp.emplsenior | 0.1826155 | 0.7763772 | 0.2352149 | 0.8146663 |
| kid.numWith-kid | -0.4196830 | 0.7140495 | -0.5877506 | 0.5584195 |
| home.size3+ | -0.8073535 | 0.5173013 | -1.5607027 | 0.1226948 |
| spiritualitylow | 0.2142551 | 0.5119587 | 0.4185007 | 0.6767460 |
| sc.idx.2 | 0.0997127 | 0.2120149 | 0.4703099 | 0.6394631 |
| gr.idx.1 | -0.5776063 | 0.1513792 | -3.8156259 | 0.0002726 |
| gr.idx.2 | -0.2377947 | 0.2766674 | -0.8594966 | 0.3927346 |
select=function(m){ # m is an object: model
e = m$resid # residuals
n0 = length(e) # sample size
SSE=(m$df)*(summary(m)$sigma)^2 # sum of squared error
R.sq=summary(m)$r.squared # Coefficient of determination: R square!
R.adj=summary(m)$adj.r # Adjusted R square
MSE=(summary(m)$sigma)^2 # square error
Cp=(SSE/MSE)-(n0-2*(n0-m$df)) # Mellow's p
AIC=n0*log(SSE)-n0*log(n0)+2*(n0-m$df) # Akaike information criterion
SBC=n0*log(SSE)-n0*log(n0)+(log(n0))*(n0-m$df) # Schwarz Bayesian Information criterion
X=model.matrix(m) # design matrix of the model
H=X%*%solve(t(X)%*%X)%*%t(X) # hat matrix
d=e/(1-diag(H))
PRESS=t(d)%*%d # predicted residual error sum of squares (PRESS)- a cross-validation measure
tbl = as.data.frame(cbind(SSE=SSE, R.sq=R.sq, R.adj = R.adj, Cp = Cp, AIC = AIC, SBC = SBC, PRD = PRESS))
names(tbl)=c("SSE", "R.sq", "R.adj", "Cp", "AIC", "SBC", "PRESS")
tbl
}
output.sum = rbind(select(full.model), select(manual.model), select(step.model))
row.names(output.sum) = c("full.model", "manual.model", "step.model")
kable(output.sum, caption = "Goodness-of-fit Measures of Candidate Models")
| SSE | R.sq | R.adj | Cp | AIC | SBC | PRESS | |
|---|---|---|---|---|---|---|---|
| full.model | 336.5232 | 0.3401346 | 0.1173230 | 27 | 176.1247 | 247.5233 | 636.7765 |
| manual.model | 488.7804 | 0.0415839 | 0.0226054 | 3 | 166.9423 | 174.8755 | 517.4949 |
| step.model | 336.5232 | 0.3401346 | 0.1173230 | 27 | 176.1247 | 247.5233 | 636.7765 |
We notice that our CP values are different, so we can not compare AIC scores. By looking at the R-squared values, we see that full.model and step.model have the same values. We will choose the full.model because it has every variable.
Although we are using the full.model, it is not a good model to use. It is not good because its value is around 11%, which means that we should do further analysis or surveying to find a better model.