The goal of this R script is to extract table information from 38 word docs, in most cases column 1 and 6, for the anomaly files 1 and 5.
library(officer) #load word doc
library(janitor)
library(ggtext)
library(tidyverse)
library(data.table)
library(knitr)
# Extract Colume Info from Single Word Doc
#
# setwd("/Users/n01635376/OneDrive - Humber College/Desktop/PPDR/FAST")
# doc1<- read_docx('ITS-ESDV_AP_2022.docx')
# content_tbl1<- docx_summary(doc1) %>% as_tibble()
#
# #extract first column content
# first_col1 <- content_tbl1 %>%
# filter(cell_id == 5) %>%
# pull(text)
#
# goals_ITS <- as.tibble(first_col1)
# goals_ITS <- as.data.frame(goals_ITS)
#
# goals_ITS<-setnames(goals_ITS, old = c('value'), new = c('Actions'))
# 2 programs have "intended goals & outcomes stored in col 5 rather than 6
# this was discovered after running the block below
# these anomaly files are extracted and rbind with the main "goals" file
# Get all doc file names
doc_files1 <- dir(file_path1, pattern = "\\.docx$", full.names = TRUE)
# extract intended goals & outcomes
tbls_goal_anomaly <- lapply(
doc_files1,
function(file) {
tbl <- read_docx(file)
file_names <- basename(file)
docx_summary(tbl) %>% as_tibble() %>% filter(cell_id == 5) %>% pull(text) %>%
as.tibble() %>% mutate(Program = basename(file)) #program from file name
}
)
goals_anomaly <- data.frame(do.call(rbind, tbls_goal_anomaly))
names(goals_anomaly)[names(goals_anomaly) == 'value'] <- 'Intended_Goals'
goals_anomaly$Program <- sub("_AP_2022.docx.*", "", goals_anomaly$Program)
goals_anomaly$Year <- "2022"
colnames(goals_anomaly)[colnames(goals_anomaly) == "Intended_Goals"] <- "Actions"
goals_anomaly$Col_Name <- "Intended Goals"
# Get all doc file names
doc_files <- dir(file_path, pattern = "\\.docx$", full.names = TRUE)
# get program name from file path
# a<- as.data.frame(doc_files)
# a$Program <- gsub("\\.[^.]*$", "", basename(doc_files))
# a$index <- seq_len(nrow(a))
# extract recommendations and program name
tbls <- lapply(
doc_files,
function(file) {
tbl <- read_docx(file)
file_names <- basename(file)
docx_summary(tbl) %>% as_tibble() %>% filter(cell_id == 1) %>% pull(text) %>%
as.tibble() %>% mutate(Program = basename(file)) #program from file name
}
)
recommendations <- data.frame(do.call(rbind, tbls)) #convert lists to df
names(recommendations)[names(recommendations) == 'value'] <- 'Recommendations'
recommendations$Program <- sub("_AP_2022.docx.*", "", recommendations$Program)
recommendations$Year <- "2022"
# extract intended goals & outcomes
tbls_goal <- lapply(
doc_files,
function(file) {
tbl <- read_docx(file)
file_names <- basename(file)
docx_summary(tbl) %>% as_tibble() %>% filter(cell_id == 6) %>% pull(text) %>%
as.tibble() %>% mutate(Program = basename(file)) #program from file name
}
)
goals <- data.frame(do.call(rbind, tbls_goal))
names(goals)[names(goals) == 'value'] <- 'Intended_Goals'
goals$Program <- sub("_AP_2022.docx.*", "", goals$Program)
goals$Year <- "2022"
colnames(recommendations)[colnames(recommendations) == "Recommendations"] <- "Actions"
recommendations$Col_Name <- "Recommendations"
colnames(goals)[colnames(goals) == "Intended_Goals"] <- "Actions"
goals$Col_Name <- "Intended Goals"
# rbind anomaly with main
goals<-rbind(goals, goals_anomaly)
data_Fast<- rbind(goals, recommendations) # rbind 2 df
data_Fast <- data_Fast[order(data_Fast$Program,data_Fast$Col_Name != "Recommendations"), ] #sort rows
data_Fast$Faculty <- "FAST"
data_Fast<-data_Fast[,c(5, 2:4, 1)] #reorder cols
data_Fast <- data_Fast[complete.cases(data_Fast) & trimws(data_Fast$Actions)!="", ] #remove empty rows
strings_to_exclude <- c("Curriculum Level", "Program Level", "Faculty Level", "Institution Level", "Intended Goals & Outcomes", "Recommendation for Action", "Recommendations for Action", "SUMMARYRECOMMENDATIONS", "Action Plan", "Action Item", "ACADEMIC PROGRAM REVIEW","Measurement of Outcomes")
# Remove rows containing the specified strings in the "Actions" column
data_Fast <- data_Fast[!grepl(paste(strings_to_exclude, collapse = "|"), data_Fast$Actions), ]
a<-data_Fast[,c(2,4)]
a<-unique(a)
unique(a$Program)
## [1] "AI with Machine Learning(Blank)"
## [2] "Architectural Technology"
## [3] "Carpentry and Reno Tech"
## [4] "Carpentry and Reno Techniques (Blank)"
## [5] "Civil Engineering Technology"
## [6] "Cloud Computing_GC"
## [7] "Computer Engineering Technology"
## [8] "Computer Programming"
## [9] "Computer Programming and Analysis"
## [10] "Computer Systems Technician - ITIS"
## [11] "Construction Stackables"
## [12] "Design Foundation_Cert"
## [13] "Electrical Engineering_Dip_ADip"
## [14] "Electrical Techniques_Cert"
## [15] "Electromechanical Engineering"
## [16] "Electronics Engineering Technician"
## [17] "Electronics Engineering Technology"
## [18] "Engineering, Bachelor of_Degree"
## [19] "Furniture and Cabinetmaking Techniques(Blank)"
## [20] "HRAC Technician"
## [21] "HRAC Technology"
## [22] "Industrial Design, Bachelor of"
## [23] "Industrial Woodworking"
## [24] "Industrial Woodworking Technician"
## [25] "Interior Decorating"
## [26] "Interior Design, Bachelor of_Degree"
## [27] "ITS-ESDV"
## [28] "Landscape Technician"
## [29] "Mechanical Engineering Technician_Diploma"
## [30] "Mechanical Engineering Technology"
## [31] "Millwright Techniques (Blank)"
## [32] "Plumbing Techniques"
## [33] "Project Mgmt"
## [34] "Supply Chain Mgmt_GC"
## [35] "Sustainable Energy and Building Tech"
## [36] "Urban Arboriculture - Tree Care (Blank)"
## [37] "Welding Techniques"
## [38] "Wireless Telecommunications"
# kable(table(a))