The goal of this R script is to extract table information from 38 word docs, in most cases column 1 and 6, for the anomaly files 1 and 5.

library(officer) #load word doc
library(janitor)
library(ggtext)
library(tidyverse)
library(data.table)
library(knitr) 
# Extract Colume Info from Single Word Doc

#
# setwd("/Users/n01635376/OneDrive - Humber College/Desktop/PPDR/FAST")
# doc1<- read_docx('ITS-ESDV_AP_2022.docx')
# content_tbl1<- docx_summary(doc1) %>% as_tibble()
# 
# #extract first column content
# first_col1 <- content_tbl1 %>%
#   filter(cell_id == 5) %>%
#   pull(text)
# 
# goals_ITS <- as.tibble(first_col1)
# goals_ITS <- as.data.frame(goals_ITS)
# 
# goals_ITS<-setnames(goals_ITS, old = c('value'), new = c('Actions'))

Handle Anomalies - Goals in column 5 rather than 6

# 2 programs have "intended goals & outcomes stored in col 5 rather than 6
# this was discovered after running the block below
# these anomaly files are extracted and rbind with the main "goals" file


# Get all doc file names
doc_files1 <- dir(file_path1, pattern = "\\.docx$", full.names = TRUE)

# extract intended goals & outcomes
tbls_goal_anomaly <- lapply(
  doc_files1,
  function(file) {
    tbl <- read_docx(file)
    file_names <- basename(file)
    docx_summary(tbl) %>% as_tibble() %>% filter(cell_id == 5) %>% pull(text) %>% 
      as.tibble() %>% mutate(Program = basename(file)) #program from file name
  }
)

goals_anomaly <- data.frame(do.call(rbind, tbls_goal_anomaly))
names(goals_anomaly)[names(goals_anomaly) == 'value'] <- 'Intended_Goals'
goals_anomaly$Program <- sub("_AP_2022.docx.*", "", goals_anomaly$Program)
goals_anomaly$Year <- "2022"

colnames(goals_anomaly)[colnames(goals_anomaly) == "Intended_Goals"] <- "Actions"
goals_anomaly$Col_Name <- "Intended Goals"

Extract Recommendations & Intended Goals using file name for Program

# Get all doc file names
doc_files <- dir(file_path, pattern = "\\.docx$", full.names = TRUE)

# get program name from file path
# a<- as.data.frame(doc_files)
# a$Program <- gsub("\\.[^.]*$", "", basename(doc_files))
# a$index <- seq_len(nrow(a))


# extract recommendations and program name
tbls <- lapply(
  doc_files,
  function(file) {
    tbl <- read_docx(file)
    file_names <- basename(file)
    docx_summary(tbl) %>% as_tibble() %>% filter(cell_id == 1) %>% pull(text) %>% 
      as.tibble() %>% mutate(Program = basename(file)) #program from file name
  }
)

recommendations <- data.frame(do.call(rbind, tbls)) #convert lists to df
names(recommendations)[names(recommendations) == 'value'] <- 'Recommendations'
recommendations$Program <- sub("_AP_2022.docx.*", "", recommendations$Program)
recommendations$Year <- "2022"


# extract intended goals & outcomes
tbls_goal <- lapply(
  doc_files,
  function(file) {
    tbl <- read_docx(file)
    file_names <- basename(file)
    docx_summary(tbl) %>% as_tibble() %>% filter(cell_id == 6) %>% pull(text) %>% 
      as.tibble() %>% mutate(Program = basename(file)) #program from file name
  }
)

goals <- data.frame(do.call(rbind, tbls_goal))
names(goals)[names(goals) == 'value'] <- 'Intended_Goals'
goals$Program <- sub("_AP_2022.docx.*", "", goals$Program)
goals$Year <- "2022"

colnames(recommendations)[colnames(recommendations) == "Recommendations"] <- "Actions"
recommendations$Col_Name <- "Recommendations"
colnames(goals)[colnames(goals) == "Intended_Goals"] <- "Actions"
goals$Col_Name <- "Intended Goals"

# rbind anomaly with main
goals<-rbind(goals, goals_anomaly)

Rbind “recommendations” and “goals” dataframes and clean

data_Fast<- rbind(goals, recommendations) # rbind 2 df
data_Fast <- data_Fast[order(data_Fast$Program,data_Fast$Col_Name != "Recommendations"), ] #sort rows
data_Fast$Faculty <- "FAST"
data_Fast<-data_Fast[,c(5, 2:4, 1)] #reorder cols
data_Fast <- data_Fast[complete.cases(data_Fast) & trimws(data_Fast$Actions)!="", ] #remove empty rows 

strings_to_exclude <- c("Curriculum Level", "Program Level", "Faculty Level", "Institution Level", "Intended Goals & Outcomes", "Recommendation for Action", "Recommendations for Action", "SUMMARYRECOMMENDATIONS", "Action Plan", "Action Item", "ACADEMIC PROGRAM REVIEW","Measurement of Outcomes")

# Remove rows containing the specified strings in the "Actions" column
data_Fast <- data_Fast[!grepl(paste(strings_to_exclude, collapse = "|"), data_Fast$Actions), ]
a<-data_Fast[,c(2,4)]
a<-unique(a)
unique(a$Program)
##  [1] "AI with Machine Learning(Blank)"              
##  [2] "Architectural Technology"                     
##  [3] "Carpentry and Reno Tech"                      
##  [4] "Carpentry and Reno Techniques (Blank)"        
##  [5] "Civil Engineering Technology"                 
##  [6] "Cloud Computing_GC"                           
##  [7] "Computer Engineering Technology"              
##  [8] "Computer Programming"                         
##  [9] "Computer Programming and Analysis"            
## [10] "Computer Systems Technician - ITIS"           
## [11] "Construction Stackables"                      
## [12] "Design Foundation_Cert"                       
## [13] "Electrical Engineering_Dip_ADip"              
## [14] "Electrical Techniques_Cert"                   
## [15] "Electromechanical Engineering"                
## [16] "Electronics Engineering Technician"           
## [17] "Electronics Engineering Technology"           
## [18] "Engineering, Bachelor of_Degree"              
## [19] "Furniture and Cabinetmaking Techniques(Blank)"
## [20] "HRAC Technician"                              
## [21] "HRAC Technology"                              
## [22] "Industrial Design, Bachelor of"               
## [23] "Industrial Woodworking"                       
## [24] "Industrial Woodworking Technician"            
## [25] "Interior Decorating"                          
## [26] "Interior Design, Bachelor of_Degree"          
## [27] "ITS-ESDV"                                     
## [28] "Landscape Technician"                         
## [29] "Mechanical Engineering Technician_Diploma"    
## [30] "Mechanical Engineering Technology"            
## [31] "Millwright Techniques (Blank)"                
## [32] "Plumbing Techniques"                          
## [33] "Project Mgmt"                                 
## [34] "Supply Chain Mgmt_GC"                         
## [35] "Sustainable Energy and Building Tech"         
## [36] "Urban Arboriculture - Tree Care (Blank)"      
## [37] "Welding Techniques"                           
## [38] "Wireless Telecommunications"
# kable(table(a))