library("quanteda")
library("readtext")
library(dplyr)
library(tidyr)
library(purrr)
library(DT)

# Import every .txt file found in the interviewtxt folder, then build a
# quanteda corpus from the imported texts and print its summary.
import <- readtext(file = "interviewtxt/*.txt")
corpus <- quanteda::corpus(import)
summary(corpus)

# add document-level variables (docvars)
# Parse the document names: split on "_" and keep the 1st piece as the
# interviewee name and the 3rd piece as "time" (2nd piece discarded), then
# keep only the first two characters of "time".
df <- data.frame(names(corpus)) %>%
  separate(col = "names.corpus.", into = c("name", NA, "time"), sep = "_") %>%
  separate(col = "time", into = c("time", NA), sep = 2)
df

# One row per distinct name with its document count. Gender is entered by
# hand below and must follow the row order that table() produces for the
# names -- NOTE(review): adding or renaming an interviewee silently shifts
# this vector; verify it against `tab` whenever the file list changes.
tab <- data.frame(table(df$name))
colnames(tab) <- c("name", "count")
tab$gender <- c(
  "F", "F", "M", "F", "F",
  "M", "M", "F", "F", "M",
  "F", "F", "F", "M", "M",
  "F", "F", "M", "M", "F",
  "M", "F", "M", "M", "F"
)
df <- merge(df, tab, by = "name")


# link names to litho numbers
# ids.csv has no header row; its two columns are labelled here.
ids <- setNames(
  read.csv(file = "ids.csv", header = FALSE),
  c("name", "Litho")
)

# merge docvars and surveydata
# Left join: every document row is kept even when its name has no id.
df2 <- merge(df, ids, by = "name", all.x = TRUE)
surv <- read.csv(file = "career_imp.csv", header = TRUE)

  # calculate selected factors
  # Each score is the unweighted mean of its survey items; a missing item
  # makes the whole score NA for that respondent.
  surv$engpc <- with(surv, (Q3Eng_m + Q3Eng_n + Q3Eng_k) / 3)
  surv$engint <- with(surv, (Q3Eng_i + Q3Eng_h + Q3Eng_j) / 3)
  surv$engrec <- with(surv, (Q3Eng_f + Q3Eng_e + Q3Eng_d + Q3Eng_g) / 4)
  # belong1 / belong2 are the outcomes of the two Belonging regressions
  surv$belong1 <- with(surv, (Q4a + Q4b + Q4c + Q4g + Q4h) / 5)
  surv$belong2 <- with(surv, (Q4e + Q4f) / 2)
  # engbel / engemp are the outcomes of the Agency Beliefs and Empowerment
  # regressions
  surv$engbel <- with(surv, (Q5d + Q5h + Q5g + Q5e + Q5b) / 5)
  surv$engemp <- with(surv, (Q5a + Q5c + Q5f) / 3)

  # subset needed columns
  # NOTE(review): columns are chosen by position; presumably 377:383 are the
  # seven scale scores appended to surv just before this -- confirm against
  # the CSV.
  surv2 <- surv[c(1, 221:237, 319:335, 377:383)]
  head(surv2)

  # calculate demographics
  # racenum = how many of the Q30a-Q30h options were selected
  race_items <- paste0("Q30", letters[1:8])
  surv2$racenum <- Reduce(`+`, lapply(surv2[race_items], as.numeric))
  surv2$raceeth[surv2$racenum == 0] <- NA
  surv2$raceeth[surv2$racenum == 2] <- "Biracial"
  surv2$raceeth[surv2$racenum > 2] <- "Multiracial"
  # exactly one option selected: label by whichever option it was
  race_labels <- c(
    Q30a = "Asian", Q30b = "Black/AA", Q30c = "Latinx", Q30d = "MidEast",
    Q30e = "NH/PI", Q30f = "NA/AN", Q30g = "White", Q30h = "WriteIn"
  )
  for (item in names(race_labels)) {
    only_this <- surv2$racenum == 1 & surv2[[item]] == 1
    surv2$raceeth[only_this] <- race_labels[[item]]
  }
  table(surv2$raceeth)

  # gennum = how many of the Q31a-Q31g options were selected
  gender_items <- paste0("Q31", letters[1:7])
  surv2$gennum <- Reduce(`+`, lapply(surv2[gender_items], as.numeric))
  table(surv2$gennum)
  surv2$genid[surv2$gennum == 0] <- NA
  surv2$genid[surv2$gennum > 1] <- "Multiple Options Selected"
  gender_labels <- c(
    Q31a = "Female", Q31b = "Male", Q31c = "Agender", Q31d = "Genderqueer",
    Q31e = "Cisgender", Q31f = "Transgender", Q31g = "Not Listed"
  )
  for (item in names(gender_labels)) {
    only_this <- surv2$gennum == 1 & surv2[[item]] == 1
    surv2$genid[only_this] <- gender_labels[[item]]
  }
  table(surv2$genid)
  # inspect respondents who selected more than one gender option
  surv2[which(surv2$gennum > 1), 28:35]

  # prep survey data for merge
  names(surv2)
  # columns 2-9 become the career_* items, columns 10-18 the field_* items
  career_vars <- paste0(
    "career_",
    c("money", "known", "helping", "supervising",
      "security", "people", "invent", "developing")
  )
  field_vars <- paste0(
    "field_",
    c("academia", "industry", "entre", "govt", "k12",
      "law", "med", "nonprofit", "other")
  )
  colnames(surv2)[2:18] <- c(career_vars, field_vars)
  # keep the id column, the renamed items, and columns 36-46
  surv3 <- surv2[c(1:18, 36:46)]
  head(surv3)

  # merge
  # Left join on Litho so documents without survey data are kept (with NAs).
  df3 <- merge(df2, surv3, by = "Litho", all.x = TRUE)
  # no survey data for Allen, Hilda, and Alex

# add vars to corpus
# merge() returns its result sorted on the join key, so after the merge on
# "Litho" the rows of df3 are not guaranteed to be in corpus order, yet the
# docvars below are assigned by position. Re-align df3 to the corpus first:
# rebuild each document's name/round key the same way it was parsed from the
# document names (1st "_"-separated piece, plus the first two characters of
# the 3rd piece) and pick the matching df3 row. Every df3 column is determined
# by name and round, so any row with the same key carries the same values.
doc_parts <- strsplit(docnames(corpus), "_", fixed = TRUE)
doc_key <- vapply(
  doc_parts,
  function(p) paste(p[1], substr(p[3], 1, 2), sep = "_"),
  character(1)
)
row_idx <- match(doc_key, paste(df3$name, df3$time, sep = "_"))
stopifnot(!anyNA(row_idx)) # every document must have a metadata row
df3 <- df3[row_idx, , drop = FALSE]

docvars(corpus, "name") <- df3$name
docvars(corpus, "round") <- df3$time

# NOTE: a "litho" docvar was previously assigned from df3$litho, but the
# column is named "Litho", so the value was NULL and no docvar was ever
# created. It is intentionally still not added here: the regression section
# selects summary(corpus) columns by position (24:31, 44:127), and an extra
# docvar would shift them. Add it only together with name-based selection.

# Remaining docvars keep their df3 column names; the order is preserved
# because later code indexes them by position.
survey_vars <- c(
  "gender",
  "career_money", "career_known", "career_helping", "career_supervising",
  "career_security", "career_people", "career_invent", "career_developing",
  "field_academia", "field_industry", "field_entre", "field_govt",
  "field_k12", "field_law", "field_med", "field_nonprofit",
  "engpc", "engint", "engbel", "engemp", "engrec",
  "belong1", "belong2",
  "raceeth", "genid"
)
for (v in survey_vars) {
  docvars(corpus, v) <- df3[[v]]
}
summary(corpus)

head(df3)

rm(tab, df, df2, ids, df3, surv, surv2,
   doc_parts, doc_key, row_idx, survey_vars, v)

# import LIWC output
liwc_out <- read.delim(file = "LIWC_output.txt", header = TRUE)

# add LIWC vars to corpus
# Look every document up in the LIWC table by its document name rather than
# relying on merge(): merge() sorts its result on the key, which need not be
# the corpus order, and summary(corpus) only lists the first 100 documents by
# default. A document missing from the LIWC output stops the script here
# instead of failing later with a length mismatch.
row_idx <- match(docnames(corpus), liwc_out$Filename)
stopifnot(!anyNA(row_idx))
# Filename first, then the LIWC columns in file order (the layout merge()
# produced before).
liwc_cols <- c("Filename", setdiff(colnames(liwc_out), "Filename"))
df <- liwc_out[row_idx, liwc_cols, drop = FALSE]

# Add the first 95 columns (Filename plus 94 LIWC variables) as docvars.
# The count is kept as-is because the regression section selects docvars by
# column position.
for (col in colnames(df)[seq_len(95)]) {
  docvars(corpus, col) <- df[col]
}

rm(row_idx, liwc_cols, col)

Regressions

# One row per document with all docvars; n is set explicitly because
# summary() on a corpus reports only the first 100 documents by default.
df <- summary(corpus, n = ndoc(corpus))
# only IVs
# NOTE(review): positional selection -- presumably the LIWC variables that
# were appended after the survey docvars; confirm if docvars are added/removed.
df2 <- subset(df, select = c(44:127))
# collapse across rounds
agg <- aggregate(df2, by = list(df$name), FUN = mean)
# one (name, gender) row per interviewee, in the same name order as agg
tab <- subset(data.frame(table(df$name, df$gender)), Freq != 0)
df3 <- data.frame(cbind(agg$Group.1, tab$Var2, agg[2:85]))
colnames(df3)[1:2] <- c("name", "gender")

# Dependent variables, averaged across rounds per interviewee. Selected by
# name instead of by position (previously df[24:31], which also pulled in the
# non-numeric raceeth column and depended on the exact docvar order).
dv_names <- c("engpc", "engint", "engbel", "engemp",
              "engrec", "belong1", "belong2")
dvs <- aggregate(df[dv_names], by = list(df$name), FUN = mean)
# Identity composite. The Identity section lists 3 + 3 + 4 items, matching
# the item counts of engpc, engint and engrec. This previously averaged in
# engbel (the agency-beliefs score, itself the outcome of another regression)
# in place of engrec.
# NOTE(review): the Identity regression output recorded further down was
# produced with the old composite and needs to be re-run.
dvs$id <- (dvs$engpc + dvs$engint + dvs$engrec) / 3
dvs2 <- dvs[c("engbel", "engemp", "belong1", "belong2", "id")]

library(pwr)
pwr.f2.test(u = 6, v = 19, f2 = 0.4 / (1 - .4), sig.level = 0.075)
## 
##      Multiple regression power calculation 
## 
##               u = 6
##               v = 19
##              f2 = 0.6666667
##       sig.level = 0.075
##           power = 0.8277789

Eng Agency Beliefs

  • Engineering can improve quality of life.
  • Engineering knowledge is for the advancement of human welfare.
  • Engineering can improve the quality of life in my community.
  • Engineering can be a resource for my community.
  • Engineering can improve our society.
library(sjPlot)
library(jtools)

dvars <- c("Beliefs", "Empowerment", "SubBelonging", "IntBelonging", "Identity")
ivars <- c("Affiliation", "Achievement", "Power", "Reward", "Risk")

# Outcome (column X1) = agency beliefs; predictors = the five LIWC drive
# categories, relabelled with ivars.
reg <- data.frame(cbind(
  dvs2$engbel,
  df3$affiliation, df3$achieve, df3$power, df3$reward, df3$risk
))
colnames(reg)[2:6] <- ivars
# reg is deliberately reused for the fitted model: the plotting calls that
# follow expect the model under this name.
reg <- lm(X1 ~ ., data = reg)
# summary.lm() has no `vifs` argument; the `vifs = T` passed here before was
# silently swallowed by `...`, so the printed output is unchanged. For
# variance inflation factors use jtools::summ(reg, vifs = TRUE).
summary(reg)
## 
## Call:
## lm(formula = X1 ~ ., data = reg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0007 -0.2552  0.1280  0.2903  1.0012 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.68045    1.44947   4.609  0.00025 ***
## Affiliation -0.01034    0.28226  -0.037  0.97120    
## Achievement -0.26332    0.53875  -0.489  0.63126    
## Power       -1.08807    0.44751  -2.431  0.02639 *  
## Reward       0.03099    0.59041   0.052  0.95875    
## Risk         4.76841    1.26357   3.774  0.00151 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5654 on 17 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.471,  Adjusted R-squared:  0.3154 
## F-statistic: 3.028 on 5 and 17 DF,  p-value: 0.03913
# Partial-effect plots (with interval and observed points) for the two
# significant predictors of agency beliefs.
effect_plot(
  reg,
  pred = Power,
  interval = TRUE,
  plot.points = TRUE,
  y.label = "Engineering Agency Beliefs"
)

effect_plot(
  reg,
  pred = Risk,
  interval = TRUE,
  plot.points = TRUE,
  y.label = "Engineering Agency Beliefs"
)

Eng Empowerment Beliefs

  • I can make changes in my community with engineering.
  • Engineering will give me the tools and resources to make an impact in my community.
  • I can make an impact in peoples’ lives through engineering.
# Outcome (column X1) = empowerment; predictors = the five LIWC drive
# categories, relabelled with ivars.
reg <- data.frame(cbind(
  dvs2$engemp,
  df3$affiliation, df3$achieve, df3$power, df3$reward, df3$risk
))
colnames(reg)[2:6] <- ivars
# reg is deliberately reused for the fitted model: the plotting calls that
# follow expect the model under this name.
reg <- lm(X1 ~ ., data = reg)
# summary.lm() has no `vifs` argument; the `vifs = T` passed here before was
# silently swallowed by `...`, so the printed output is unchanged. For
# variance inflation factors use jtools::summ(reg, vifs = TRUE).
summary(reg)
## 
## Call:
## lm(formula = X1 ~ ., data = reg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8649 -0.5153  0.1645  0.3828  0.7481 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.2839     1.4188   5.134 8.29e-05 ***
## Affiliation   0.1760     0.2763   0.637   0.5326    
## Achievement  -0.8094     0.5273  -1.535   0.1432    
## Power        -0.9660     0.4380  -2.205   0.0415 *  
## Reward       -0.4559     0.5779  -0.789   0.4411    
## Risk          5.3006     1.2368   4.286   0.0005 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5535 on 17 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.5276, Adjusted R-squared:  0.3887 
## F-statistic: 3.798 on 5 and 17 DF,  p-value: 0.01714
# Partial-effect plots (with interval and observed points) for the two
# significant predictors of empowerment.
effect_plot(
  reg,
  pred = Power,
  interval = TRUE,
  plot.points = TRUE,
  y.label = "Engineering Empowerment"
)

effect_plot(
  reg,
  pred = Risk,
  interval = TRUE,
  plot.points = TRUE,
  y.label = "Engineering Empowerment"
)

Belonging (Subject)

  • I feel comfortable in engineering.
  • I feel I belong in engineering.
  • I enjoy being in engineering.
  • I feel committed to engineering.
  • I feel sure about my choice of engineering as a major.
# Outcome (column X1) = belonging (subject); predictors = the five LIWC
# drive categories, relabelled with ivars.
reg <- data.frame(cbind(
  dvs2$belong1,
  df3$affiliation, df3$achieve, df3$power, df3$reward, df3$risk
))
colnames(reg)[2:6] <- ivars
# reg is deliberately reused for the fitted model: the plotting calls that
# follow expect the model under this name.
reg <- lm(X1 ~ ., data = reg)
# summary.lm() has no `vifs` argument; the `vifs = T` passed here before was
# silently swallowed by `...`, so the printed output is unchanged. For
# variance inflation factors use jtools::summ(reg, vifs = TRUE).
summary(reg)
## 
## Call:
## lm(formula = X1 ~ ., data = reg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.41101 -0.34291  0.06116  0.49097  1.05926 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.1274     1.8305   4.440 0.000359 ***
## Affiliation   0.2241     0.3565   0.629 0.537876    
## Achievement  -0.8071     0.6804  -1.186 0.251845    
## Power        -0.8289     0.5652  -1.467 0.160732    
## Reward       -1.3363     0.7456  -1.792 0.090914 .  
## Risk          5.0685     1.5958   3.176 0.005523 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7141 on 17 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.4202, Adjusted R-squared:  0.2496 
## F-statistic: 2.464 on 5 and 17 DF,  p-value: 0.07468
# Partial-effect plots for the belonging (subject) model, i.e. the belong1
# outcome. Both y labels therefore read "Belonging 1"; the Risk plot was
# previously mislabelled "Belonging 2" (copied from the interpersonal model).
effect_plot(
  reg,
  pred = Reward,
  interval = TRUE,
  plot.points = TRUE,
  x.label = "Reward",
  y.label = "Belonging 1"
)

effect_plot(
  reg,
  pred = Risk,
  interval = TRUE,
  plot.points = TRUE,
  x.label = "Risk",
  y.label = "Belonging 1"
)

Belonging (Interpersonal)

  • I feel supported in my engineering class.
  • I feel that I am part of my engineering class.
# Outcome (column X1) = belonging (interpersonal); predictors = the five
# LIWC drive categories, relabelled with ivars.
reg <- data.frame(cbind(
  dvs2$belong2,
  df3$affiliation, df3$achieve, df3$power, df3$reward, df3$risk
))
colnames(reg)[2:6] <- ivars
# reg is deliberately reused for the fitted model: the plotting calls that
# follow expect the model under this name.
reg <- lm(X1 ~ ., data = reg)
# summary.lm() has no `vifs` argument; the `vifs = T` passed here before was
# silently swallowed by `...`, so the printed output is unchanged. For
# variance inflation factors use jtools::summ(reg, vifs = TRUE).
summary(reg)
## 
## Call:
## lm(formula = X1 ~ ., data = reg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.34279 -0.50273 -0.07541  0.47278  1.04765 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   7.2975     1.8861   3.869  0.00123 **
## Affiliation   0.6906     0.3673   1.880  0.07733 . 
## Achievement  -1.2157     0.7010  -1.734  0.10099   
## Power        -1.0736     0.5823  -1.844  0.08273 . 
## Reward       -1.0537     0.7683  -1.372  0.18804   
## Risk          6.0847     1.6442   3.701  0.00178 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7358 on 17 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.4918, Adjusted R-squared:  0.3423 
## F-statistic:  3.29 on 5 and 17 DF,  p-value: 0.02931
# Partial-effect plots (with interval and observed points) for the
# belonging (interpersonal) model.
effect_plot(
  reg,
  pred = Affiliation,
  interval = TRUE,
  plot.points = TRUE,
  x.label = "Affiliation",
  y.label = "Belonging 2"
)

effect_plot(
  reg,
  pred = Power,
  interval = TRUE,
  plot.points = TRUE,
  x.label = "Power",
  y.label = "Belonging 2"
)

effect_plot(
  reg,
  pred = Risk,
  interval = TRUE,
  plot.points = TRUE,
  x.label = "Risk",
  y.label = "Belonging 2"
)

Identity

  • I can do well on exams in engineering.
  • I understand concepts I have studied in engineering.
  • I am confident that I can understand engineering in class.
  • I enjoy learning engineering.
  • I am interested in learning more about engineering.
  • I find fulfillment in doing engineering.
  • My peers see me as an engineer.
  • My instructors see me as an engineer.
  • My parents see me as an engineer.
  • I’ve had experiences in which I was recognized as an engineer.
# Outcome (column X1) = identity composite; predictors = the five LIWC
# drive categories, relabelled with ivars.
reg <- data.frame(cbind(
  dvs2$id,
  df3$affiliation, df3$achieve, df3$power, df3$reward, df3$risk
))
colnames(reg)[2:6] <- ivars
# reg is deliberately reused for the fitted model: the plotting calls that
# follow expect the model under this name.
reg <- lm(X1 ~ ., data = reg)
# summary.lm() has no `vifs` argument; the `vifs = T` passed here before was
# silently swallowed by `...`, so the printed output is unchanged. For
# variance inflation factors use jtools::summ(reg, vifs = TRUE).
summary(reg)
## 
## Call:
## lm(formula = X1 ~ ., data = reg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9092 -0.3968  0.1275  0.3579  0.8758 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.6999     1.3933   4.809 0.000164 ***
## Affiliation   0.1888     0.2713   0.696 0.495935    
## Achievement  -0.1163     0.5179  -0.225 0.824946    
## Power        -0.8553     0.4302  -1.988 0.063127 .  
## Reward       -0.5843     0.5675  -1.029 0.317680    
## Risk          3.3553     1.2146   2.762 0.013315 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5435 on 17 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.3404, Adjusted R-squared:  0.1464 
## F-statistic: 1.755 on 5 and 17 DF,  p-value: 0.1762
# Partial-effect plots (with interval and observed points) for the
# identity model.
effect_plot(
  reg,
  pred = Power,
  interval = TRUE,
  plot.points = TRUE,
  x.label = "Power",
  y.label = "Identity"
)

effect_plot(
  reg,
  pred = Risk,
  interval = TRUE,
  plot.points = TRUE,
  x.label = "Risk",
  y.label = "Identity"
)

Examining Words & Phrases

# Per-interviewee power scores, kept side by side with the names for
# inspection.
pow <- data.frame(df3.name = df3$name, df3.power = df3$power)
# pow[order(pow$df3.power),]

# Split the strings in `str` on single spaces and return, as one flat
# character vector, every resulting word that is not listed in `stopwords`.
#
# str:       character vector of words or space-separated phrases
# stopwords: character vector of words to drop
removeWords <- function(str, stopwords) {
  tokens <- unlist(strsplit(str, " "))
  keep <- is.na(match(tokens, stopwords))
  paste(tokens[keep])
}

# Read the three word tables and keep the Word/power columns of the rows
# flagged with "X" in the power column.
tchuck1 <- read.csv(file = "tchuck1.csv", header = TRUE)
tchuck2 <- read.csv(file = "tchuck2.csv", header = TRUE)
tchuck3 <- read.csv(file = "tchuck3.csv", header = TRUE)

power_cols <- c("Word", "power")
t1 <- tchuck1[which(tchuck1$power == "X"), power_cols]
t2 <- tchuck2[which(tchuck2$power == "X"), power_cols]
t3 <- tchuck3[which(tchuck3$power == "X"), power_cols]

# Stack the three tables, de-duplicate the words, then split phrases into
# single words and drop the stopword "up".
powerdf <- data.frame(rbind(t1, t2, t3))
powerdf2 <- unique(as.character(powerdf$Word))
stopwords <- c("up")
powerdf3 <- removeWords(powerdf2, stopwords)

library(quanteda)

# Document-feature matrix grouped by the gender docvar, restricted to the
# power-word list.
dfm_corp <- dfm(corpus, groups = "gender")
dfm_corp2 <- dfm_keep(dfm_corp, pattern = powerdf3)

# Ten shades of the same blue, alpha factor 0.1 through 1.
col <- vapply(
  seq(0.1, 1, 0.1),
  function(alpha) adjustcolor("#1F78B4", alpha),
  character(1)
)
textplot_wordcloud(
  dfm_corp2,
  adjust = 0.5,
  random_order = FALSE,
  color = col,
  rotation = FALSE,
  min_size = 1,
  max_size = 4
)

# exploring corpus text -- search for a word and view the context
# (10 tokens either side of each hit, shown as an interactive table)
kwic_test <- kwic(corpus, pattern = "up", window = 10)
datatable(kwic_test)

# Comparison word cloud of the power words across the gender groups.
textplot_wordcloud(
  dfm_corp2,
  comparison = TRUE,
  color = c("blue", "red"),
  rotation = FALSE,
  min_size = 1,
  max_size = 6
)