library("quanteda")
library("readtext")
library(dplyr)
library(tidyr)
library(purrr)
library(DT)
# Import every .txt file found in the interviewtxt folder.
import <- readtext("interviewtxt/*.txt")
# Build the corpus from the imported files. The resulting object is stored
# under the name `corpus`, the same name as the constructor used to build it.
corpus <- corpus(x = import)
# Print the per-document overview.
summary(corpus)
# add document-level variables (docvars)
# process names: each document name is split on "_" into three pieces. The
# first piece is kept as `name`, the second is discarded, and the third is
# cut after its second character so only those two characters remain as `time`.
df <- data.frame(names(corpus))
df <- separate(
  df,
  col = "names.corpus.",
  into = c("name", NA, "time"),
  sep = "_"
)
df <- separate(df, col = "time", into = c("time", NA), sep = 2)
df
# One row per distinct name, with the number of documents carrying that name.
tab <- data.frame(table(df$name))
colnames(tab) <- c("name", "count")
# Hand-entered gender codes: one value per row of `tab`, in the order in which
# table() lists the names. The vector must have exactly nrow(tab) entries.
tab$gender <- c(
  "F", "F", "M", "F", "F",
  "M", "M", "F", "F", "M",
  "F", "F", "F", "M", "M",
  "F", "F", "M", "M", "F",
  "M", "F", "M", "M", "F"
)
# Attach count and gender to every document row.
df <- merge(x = df, y = tab, by = "name")
# link names to litho numbers
# ids.csv has no header row; its two columns are labelled here.
ids <- read.csv("ids.csv", header = FALSE)
names(ids) <- c("name", "Litho")
# merge docvars and surveydata
# all.x keeps every row of df even when the name has no entry in ids.
df2 <- merge(x = df, y = ids, by = "name", all.x = TRUE)
surv <- read.csv("career_imp.csv", header = TRUE)
# calculate selected factors
# Each factor score is the plain average of its items, so a missing item
# makes that respondent's factor score NA.
surv$engpc <- (surv$Q3Eng_m + surv$Q3Eng_n + surv$Q3Eng_k) / 3
surv$engint <- (surv$Q3Eng_i + surv$Q3Eng_h + surv$Q3Eng_j) / 3
surv$engrec <- (surv$Q3Eng_f + surv$Q3Eng_e + surv$Q3Eng_d + surv$Q3Eng_g) / 4
surv$belong1 <- (surv$Q4a + surv$Q4b + surv$Q4c + surv$Q4g + surv$Q4h) / 5
surv$belong2 <- (surv$Q4e + surv$Q4f) / 2
surv$engbel <- (surv$Q5d + surv$Q5h + surv$Q5g + surv$Q5e + surv$Q5b) / 5
surv$engemp <- (surv$Q5a + surv$Q5c + surv$Q5f) / 3
# subset needed columns
# Columns are picked by position, so this depends on the exact layout of
# career_imp.csv. NOTE(review): 377:383 are presumably the seven factor
# scores appended above -- confirm against names(surv).
surv2 <- surv[, c(1, 221:237, 319:335, 377:383)]
head(surv2)
# calculate demographics
# Label attached to each single-selection race/ethnicity item.
race_labels <- c(
  Q30a = "Asian", Q30b = "Black/AA", Q30c = "Latinx", Q30d = "MidEast",
  Q30e = "NH/PI", Q30f = "NA/AN", Q30g = "White", Q30h = "WriteIn"
)
# racenum is the sum of the eight Q30 items (assumes they are coded 0/1, so
# the sum is the number of options selected -- TODO confirm).
surv2$racenum <- Reduce(`+`, lapply(surv2[names(race_labels)], as.numeric))
surv2$raceeth[surv2$racenum == 0] <- NA
surv2$raceeth[surv2$racenum == 2] <- "Biracial"
surv2$raceeth[surv2$racenum > 2] <- "Multiracial"
# Exactly one option selected: use that option's label.
for (item in names(race_labels)) {
  surv2$raceeth[surv2$racenum == 1 & surv2[[item]] == 1] <- race_labels[[item]]
}
table(surv2$raceeth)

# Same scheme for the seven Q31 gender-identity items.
gender_labels <- c(
  Q31a = "Female", Q31b = "Male", Q31c = "Agender", Q31d = "Genderqueer",
  Q31e = "Cisgender", Q31f = "Transgender", Q31g = "Not Listed"
)
surv2$gennum <- Reduce(`+`, lapply(surv2[names(gender_labels)], as.numeric))
table(surv2$gennum)
surv2$genid[surv2$gennum == 0] <- NA
surv2$genid[surv2$gennum > 1] <- "Multiple Options Selected"
for (item in names(gender_labels)) {
  surv2$genid[surv2$gennum == 1 & surv2[[item]] == 1] <- gender_labels[[item]]
}
table(surv2$genid)
# Show the rows where more than one gender option was selected.
# NOTE(review): columns 28:35 are chosen by position -- presumably the Q31
# items; confirm against names(surv2).
surv2[which(surv2$gennum > 1), 28:35]
rm(race_labels, gender_labels, item)
# prep survey data for merge
names(surv2)
# Give columns 2-18 readable names: eight career-value items followed by
# nine field items (assigned by position).
colnames(surv2)[2:18] <- c(
  "career_money", "career_known", "career_helping", "career_supervising",
  "career_security", "career_people", "career_invent", "career_developing",
  "field_academia", "field_industry", "field_entre", "field_govt",
  "field_k12", "field_law", "field_med", "field_nonprofit", "field_other"
)
# Keep the key column, the renamed items, and columns 36-46 (by position).
surv3 <- surv2[, c(1:18, 36:46)]
head(surv3)
# merge
# Left-join the survey columns onto the per-document table by Litho number.
df3 <- merge(df2, surv3, by = "Litho", all.x = TRUE)
# no survey data for Allen, Hilda, and Alex

# merge() returns its rows sorted by the join key, so df3 is no longer in
# corpus document order, while `docvars<-` assigns values by position.
# Put the rows back into document order before attaching anything. The key
# is rebuilt the same way the document names were split into name/time:
# first "_"-separated piece, plus the first two characters of the third.
doc_parts <- strsplit(docnames(corpus), "_", fixed = TRUE)
doc_key <- vapply(
  doc_parts,
  function(p) paste(p[1], substr(p[3], 1, 2), sep = "_"),
  character(1)
)
doc_rows <- match(doc_key, paste(df3$name, df3$time, sep = "_"))
if (anyNA(doc_rows)) {
  stop(
    "No docvar row found for document(s): ",
    paste(docnames(corpus)[is.na(doc_rows)], collapse = ", "),
    call. = FALSE
  )
}
df3 <- df3[doc_rows, , drop = FALSE]

# add vars to corpus
docvars(corpus, "name") <- df3$name
docvars(corpus, "round") <- df3$time
docvars(corpus, "gender") <- df3$gender
# NOTE(review): the column in df3 is `Litho`, so df3$litho is NULL and this
# line appears to add no docvar. It is left unchanged on purpose: the
# regression prep further down selects summary(corpus) columns by position,
# and adding a litho docvar here would shift those positions.
docvars(corpus, "litho") <- df3$litho
# Survey-derived docvars, attached in this exact order (later code depends
# on the resulting column positions).
survey_fields <- c(
  "career_money", "career_known", "career_helping", "career_supervising",
  "career_security", "career_people", "career_invent", "career_developing",
  "field_academia", "field_industry", "field_entre", "field_govt",
  "field_k12", "field_law", "field_med", "field_nonprofit",
  "engpc", "engint", "engbel", "engemp", "engrec",
  "belong1", "belong2", "raceeth", "genid"
)
for (field in survey_fields) {
  docvars(corpus, field) <- df3[[field]]
}
summary(corpus)
head(df3)
rm(tab, df, df2, ids, df3, surv, surv2,
   doc_parts, doc_key, doc_rows, survey_fields, field)
# import LIWC output
liwc_out <- read.delim("LIWC_output.txt", header = TRUE)
# add LIWC vars to corpus
# Key the LIWC rows by document name. docnames() is used rather than the
# first column of summary(corpus) because summary() on a corpus only reports
# the first 100 documents by default.
summ <- data.frame(Filename = docnames(corpus))
df <- merge(summ, liwc_out, by = "Filename")
# merge() sorts its result by Filename and drops documents with no LIWC row,
# while `docvars<-` assigns by position. Realign to corpus document order and
# fail loudly if any document has no LIWC output.
liwc_rows <- match(docnames(corpus), df$Filename)
if (anyNA(liwc_rows)) {
  stop(
    "No LIWC output row for document(s): ",
    paste(docnames(corpus)[is.na(liwc_rows)], collapse = ", "),
    call. = FALSE
  )
}
df <- df[liwc_rows, , drop = FALSE]
rownames(df) <- NULL
# Attach every merged column (Filename included) as a docvar, in column
# order, instead of assuming the file has exactly 95 columns.
for (j in seq_along(df)) {
  docvars(corpus, colnames(df)[j]) <- df[j]
}
rm(j, summ, liwc_rows)
Regressions
- Drives – an overarching dimension that captures the following needs, motives, drives:
- Affiliation – McClelland-like dimensions including reference to others
- Achievement – references to success and failure, achievement striving
- Power – references relevant to status, dominance, social hierarchies
- Reward focus – references to rewards, incentives, positive goals, approach
- Risk focus – references to dangers, concerns, things to avoid
# Per-document table: text statistics followed by every docvar. n is set
# explicitly because summary() on a corpus reports only the first 100
# documents by default.
df <- summary(corpus, n = ndoc(corpus))
# only IVs
# NOTE(review): selected by position -- presumably the LIWC category columns;
# confirm against names(df) whenever the docvars or LIWC file change.
df2 <- subset(df, select = c(44:127))
# collapse across rounds: one row per participant, mean of each IV
agg <- aggregate(df2, by = list(df$name), FUN = mean)
# Look each participant's gender up by name. Taking it from the non-zero
# cells of table(name, gender) does not work: those rows come out ordered by
# gender first and name second, which does not line up with agg's rows.
df3 <- data.frame(
  name = agg$Group.1,
  gender = factor(df$gender[match(agg$Group.1, df$name)]),
  agg[-1]
)
# Outcome scores, selected by name so they do not depend on column positions
# (and so the character column raceeth is not averaged).
dv_cols <- c("engpc", "engint", "engbel", "engemp",
             "engrec", "belong1", "belong2")
dvs <- aggregate(df[dv_cols], by = list(df$name), FUN = mean)
# Identity composite. NOTE(review): the Identity items listed in this
# document are the engpc, engint and engrec items (3 + 3 + 4), and engbel is
# analysed as its own outcome, so engrec replaces engbel here -- confirm
# against the instrument.
dvs$id <- (dvs$engpc + dvs$engint + dvs$engrec) / 3
dvs2 <- dvs[c("engbel", "engemp", "belong1", "belong2", "id")]
rm(dv_cols)
library(pwr)
# Power for a multiple regression: u and v are the numerator and denominator
# degrees of freedom, and f2 is computed as 0.4 / (1 - 0.4).
# NOTE(review): the regressions in this document use five predictors;
# confirm that u = 6 is intended.
pwr.f2.test(
  u = 6,
  v = 19,
  f2 = 0.4 / (1 - 0.4),
  sig.level = 0.075
)
##
## Multiple regression power calculation
##
## u = 6
## v = 19
## f2 = 0.6666667
## sig.level = 0.075
## power = 0.8277789
Eng Agency Beliefs
- Engineering can improve quality of life.
- Engineering knowledge is for the advancement of human welfare.
- Engineering can improve the quality of life in my community.
- Engineering can be a resource for my community.
- Engineering can improve our society.
library(sjPlot)
library(jtools)
# Display names for the five outcomes and the five LIWC drive predictors.
dvars <- c("Beliefs", "Empowerment", "SubBelonging", "IntBelonging", "Identity")
ivars <- c("Affiliation", "Achievement", "Power", "Reward", "Risk")
# Model frame: outcome (engbel) in X1, followed by the five predictors.
reg <- data.frame(
  dvs2$engbel,
  df3$affiliation, df3$achieve, df3$power, df3$reward, df3$risk
)
colnames(reg) <- c("X1", ivars)
# `reg` is reused: first the data, then the fitted model.
reg <- lm(X1 ~ ., data = reg)
# NOTE(review): the printed output shows no VIFs; `vifs` looks like an
# argument of jtools::summ() rather than summary() -- confirm intent.
summary(reg, vifs = TRUE)
##
## Call:
## lm(formula = X1 ~ ., data = reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0007 -0.2552 0.1280 0.2903 1.0012
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.68045 1.44947 4.609 0.00025 ***
## Affiliation -0.01034 0.28226 -0.037 0.97120
## Achievement -0.26332 0.53875 -0.489 0.63126
## Power -1.08807 0.44751 -2.431 0.02639 *
## Reward 0.03099 0.59041 0.052 0.95875
## Risk 4.76841 1.26357 3.774 0.00151 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5654 on 17 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.471, Adjusted R-squared: 0.3154
## F-statistic: 3.028 on 5 and 17 DF, p-value: 0.03913
# Effect plot for the Power predictor, with interval and data points.
effect_plot(
  reg,
  pred = Power,
  interval = TRUE,
  plot.points = TRUE,
  y.label = "Engineering Agency Beliefs"
)

# Effect plot for the Risk predictor, with interval and data points.
effect_plot(
  reg,
  pred = Risk,
  interval = TRUE,
  plot.points = TRUE,
  y.label = "Engineering Agency Beliefs"
)

Eng Empowerment Beliefs
- I can make changes in my community with engineering.
- Engineering will give me the tools and resources to make an impact in my community.
- I can make an impact in people’s lives through engineering.
# Model frame: outcome (engemp) in X1, followed by the five predictors.
reg <- data.frame(
  dvs2$engemp,
  df3$affiliation, df3$achieve, df3$power, df3$reward, df3$risk
)
colnames(reg) <- c("X1", ivars)
# `reg` is reused: first the data, then the fitted model.
reg <- lm(X1 ~ ., data = reg)
# NOTE(review): the printed output shows no VIFs; `vifs` looks like an
# argument of jtools::summ() rather than summary() -- confirm intent.
summary(reg, vifs = TRUE)
##
## Call:
## lm(formula = X1 ~ ., data = reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8649 -0.5153 0.1645 0.3828 0.7481
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.2839 1.4188 5.134 8.29e-05 ***
## Affiliation 0.1760 0.2763 0.637 0.5326
## Achievement -0.8094 0.5273 -1.535 0.1432
## Power -0.9660 0.4380 -2.205 0.0415 *
## Reward -0.4559 0.5779 -0.789 0.4411
## Risk 5.3006 1.2368 4.286 0.0005 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5535 on 17 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.5276, Adjusted R-squared: 0.3887
## F-statistic: 3.798 on 5 and 17 DF, p-value: 0.01714
# Effect plot for the Power predictor, with interval and data points.
effect_plot(
  reg,
  pred = Power,
  interval = TRUE,
  plot.points = TRUE,
  y.label = "Engineering Empowerment"
)

# Effect plot for the Risk predictor, with interval and data points.
effect_plot(
  reg,
  pred = Risk,
  interval = TRUE,
  plot.points = TRUE,
  y.label = "Engineering Empowerment"
)

Belonging (Subject)
- I feel comfortable in engineering.
- I feel I belong in engineering.
- I enjoy being in engineering.
- I feel committed to engineering.
- I feel sure about my choice of engineering as a major.
# Model frame: outcome (belong1) in X1, followed by the five predictors.
reg <- data.frame(
  dvs2$belong1,
  df3$affiliation, df3$achieve, df3$power, df3$reward, df3$risk
)
colnames(reg) <- c("X1", ivars)
# `reg` is reused: first the data, then the fitted model.
reg <- lm(X1 ~ ., data = reg)
# NOTE(review): the printed output shows no VIFs; `vifs` looks like an
# argument of jtools::summ() rather than summary() -- confirm intent.
summary(reg, vifs = TRUE)
##
## Call:
## lm(formula = X1 ~ ., data = reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.41101 -0.34291 0.06116 0.49097 1.05926
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.1274 1.8305 4.440 0.000359 ***
## Affiliation 0.2241 0.3565 0.629 0.537876
## Achievement -0.8071 0.6804 -1.186 0.251845
## Power -0.8289 0.5652 -1.467 0.160732
## Reward -1.3363 0.7456 -1.792 0.090914 .
## Risk 5.0685 1.5958 3.176 0.005523 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7141 on 17 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.4202, Adjusted R-squared: 0.2496
## F-statistic: 2.464 on 5 and 17 DF, p-value: 0.07468
# Effect plots for the model fitted just above, whose outcome is
# dvs2$belong1. Both plots therefore carry the "Belonging 1" axis label; the
# second one was previously labelled "Belonging 2", which is the outcome of
# a different model.
effect_plot(
  reg,
  pred = Reward,
  interval = TRUE,
  plot.points = TRUE,
  x.label = "Reward",
  y.label = "Belonging 1"
)

effect_plot(
  reg,
  pred = Risk,
  interval = TRUE,
  plot.points = TRUE,
  x.label = "Risk",
  y.label = "Belonging 1"
)

Belonging (Interpersonal)
- I feel supported in my engineering class.
- I feel that I am part of my engineering class.
# Model frame: outcome (belong2) in X1, followed by the five predictors.
reg <- data.frame(
  dvs2$belong2,
  df3$affiliation, df3$achieve, df3$power, df3$reward, df3$risk
)
colnames(reg) <- c("X1", ivars)
# `reg` is reused: first the data, then the fitted model.
reg <- lm(X1 ~ ., data = reg)
# NOTE(review): the printed output shows no VIFs; `vifs` looks like an
# argument of jtools::summ() rather than summary() -- confirm intent.
summary(reg, vifs = TRUE)
##
## Call:
## lm(formula = X1 ~ ., data = reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.34279 -0.50273 -0.07541 0.47278 1.04765
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.2975 1.8861 3.869 0.00123 **
## Affiliation 0.6906 0.3673 1.880 0.07733 .
## Achievement -1.2157 0.7010 -1.734 0.10099
## Power -1.0736 0.5823 -1.844 0.08273 .
## Reward -1.0537 0.7683 -1.372 0.18804
## Risk 6.0847 1.6442 3.701 0.00178 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7358 on 17 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.4918, Adjusted R-squared: 0.3423
## F-statistic: 3.29 on 5 and 17 DF, p-value: 0.02931
# Effect plot for the Affiliation predictor, with interval and data points.
effect_plot(
  reg,
  pred = Affiliation,
  interval = TRUE,
  plot.points = TRUE,
  x.label = "Affiliation",
  y.label = "Belonging 2"
)

# Effect plot for the Power predictor, with interval and data points.
effect_plot(
  reg,
  pred = Power,
  interval = TRUE,
  plot.points = TRUE,
  x.label = "Power",
  y.label = "Belonging 2"
)

# Effect plot for the Risk predictor, with interval and data points.
effect_plot(
  reg,
  pred = Risk,
  interval = TRUE,
  plot.points = TRUE,
  x.label = "Risk",
  y.label = "Belonging 2"
)

Identity
- I can do well on exams in engineering.
- I understand concepts I have studied in engineering.
- I am confident that I can understand engineering in class.
- I enjoy learning engineering.
- I am interested in learning more about engineering.
- I find fulfillment in doing engineering.
- My peers see me as an engineer.
- My instructors see me as an engineer.
- My parents see me as an engineer.
- I’ve had experiences in which I was recognized as an engineer.
# Model frame: outcome (id) in X1, followed by the five predictors.
reg <- data.frame(
  dvs2$id,
  df3$affiliation, df3$achieve, df3$power, df3$reward, df3$risk
)
colnames(reg) <- c("X1", ivars)
# `reg` is reused: first the data, then the fitted model.
reg <- lm(X1 ~ ., data = reg)
# NOTE(review): the printed output shows no VIFs; `vifs` looks like an
# argument of jtools::summ() rather than summary() -- confirm intent.
summary(reg, vifs = TRUE)
##
## Call:
## lm(formula = X1 ~ ., data = reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9092 -0.3968 0.1275 0.3579 0.8758
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.6999 1.3933 4.809 0.000164 ***
## Affiliation 0.1888 0.2713 0.696 0.495935
## Achievement -0.1163 0.5179 -0.225 0.824946
## Power -0.8553 0.4302 -1.988 0.063127 .
## Reward -0.5843 0.5675 -1.029 0.317680
## Risk 3.3553 1.2146 2.762 0.013315 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5435 on 17 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.3404, Adjusted R-squared: 0.1464
## F-statistic: 1.755 on 5 and 17 DF, p-value: 0.1762
# Effect plot for the Power predictor, with interval and data points.
effect_plot(
  reg,
  pred = Power,
  interval = TRUE,
  plot.points = TRUE,
  x.label = "Power",
  y.label = "Identity"
)

# Effect plot for the Risk predictor, with interval and data points.
effect_plot(
  reg,
  pred = Risk,
  interval = TRUE,
  plot.points = TRUE,
  x.label = "Risk",
  y.label = "Identity"
)

Examining Words & Phrases
# Two-column table pairing each name in df3 with its power value.
pow <- data.frame(df3.name = df3$name, df3.power = df3$power)
# pow[order(pow$df3.power),]
#' Split strings into words and drop unwanted ones
#'
#' Every element of `str` is split on single spaces, the pieces are pooled
#' into one vector, and any piece found in `stopwords` is removed.
#'
#' @param str Character vector of words or space-separated phrases.
#' @param stopwords Character vector of words to remove.
#' @return Character vector of the remaining words, in their original order
#'   (duplicates are kept).
removeWords <- function(str, stopwords) {
  tokens <- unlist(strsplit(str, " "))
  is_kept <- is.na(match(tokens, stopwords))
  paste(tokens[is_kept])
}
# Three word lists; rows whose `power` column is "X" are the flagged words.
tchuck1 <- read.csv("tchuck1.csv", header = TRUE)
tchuck2 <- read.csv("tchuck2.csv", header = TRUE)
tchuck3 <- read.csv("tchuck3.csv", header = TRUE)
# which() drops rows where `power` is missing, so only true matches are kept.
t1 <- tchuck1[which(tchuck1$power == "X"), c("Word", "power")]
t2 <- tchuck2[which(tchuck2$power == "X"), c("Word", "power")]
t3 <- tchuck3[which(tchuck3$power == "X"), c("Word", "power")]
# Stack the three lists and reduce them to a de-duplicated word vector.
powerdf <- data.frame(rbind(t1, t2, t3))
powerdf2 <- unique(as.character(powerdf$Word))
# Words to drop from the list (this variable shares its name with
# quanteda's stopwords() function).
stopwords <- "up"
powerdf3 <- removeWords(powerdf2, stopwords)
library(quanteda)
# Document-feature matrix grouped by the gender docvar, then restricted to
# the flagged words.
dfm_corp <- dfm(corpus, groups = "gender")
dfm_corp2 <- dfm_keep(dfm_corp, pattern = powerdf3)
# Ten versions of the same blue, adjusted by factors 0.1 through 1.
col <- vapply(
  seq(0.1, 1, 0.1),
  function(alpha) adjustcolor("#1F78B4", alpha),
  character(1)
)
textplot_wordcloud(
  dfm_corp2,
  adjust = 0.5,
  random_order = FALSE,
  color = col,
  rotation = FALSE,
  min_size = 1,
  max_size = 4
)

# exploring corpus text -- search for a word and view the context
# Each match of "up" is shown with a window of 10 on either side.
kwic_test <- kwic(x = corpus, pattern = "up", window = 10)
datatable(kwic_test)
# Comparison cloud over the gender-grouped dfm, one colour per group.
textplot_wordcloud(
  dfm_corp2,
  comparison = TRUE,
  color = c("blue", "red"),
  rotation = FALSE,
  min_size = 1,
  max_size = 6
)
