Assignment 1: Text Data Preparation

#Setting

knitr::opts_chunk$set(echo = TRUE)
library(quanteda)

## Package version: 4.3.1
## Unicode version: 14.0
## ICU version: 71.1

## Parallel computing: disabled

## See https://quanteda.io for tutorials and examples.

library(readtext)

## 
## Attaching package: 'readtext'

## The following object is masked from 'package:quanteda':
## 
##     texts

# Answer Q1
# Q1. Read all .txt files from the folder titled data under Assignment 1, generate a corpus. Print the ID and the content of first 5 documents with a limit of 100 characters.
## Steps:
### Step 1: Read all files and their content.
### Step 2: Generate a corpus.
### Step 3: Print the ID and the content of the first 5 documents with a limit of 100 characters.

# Locate every .txt file in the directory given by the report parameter
# `data_dir`, and fail early with a clear message when none are found.
file_list <- list.files(params$data_dir, pattern = "\\.txt$", full.names = TRUE)
if (length(file_list) == 0) {
  stop("No .txt files found in: ", params$data_dir, call. = FALSE)
}
movie_texts <- readtext(file_list)

# Generate the corpus
mycorpus <- corpus(movie_texts)

# Print the ID and the first 100 characters of up to 5 documents.
# seq_len(min(...)) keeps the loop inside the corpus when it holds fewer than
# 5 documents; a fixed 1:5 would index past the end in that case.
corpus_text <- as.character(mycorpus)
for (i in seq_len(min(5, ndoc(mycorpus)))) {
  cat("Document ID:", docnames(mycorpus)[i], "\n")
  cat("Content:", substr(corpus_text[i], 1, 100), "\n\n")
}

## Document ID: 11_8.txt 
## Content: I was looking forward to The Guardian, but when I walked into the theater I wasn't really in the moo 
## 
## Document ID: 12_9.txt 
## Content: I was pleasantly surprised to find this movie showing as a sneak preview in my local theater.<br />< 
## 
## Document ID: 13_9.txt 
## Content: I work at a movie theater and every Thursday night we have an employee screening of one movie that c 
## 
## Document ID: 14_8.txt 
## Content: I attended an advance screening of this film not sure of what to expect from Kevin Costner and Ashto 
## 
## Document ID: 15_10.txt 
## Content: As others that have commented around the web... I'm a 130 pilot in the Coast Guard. Having said that

# Answer Q2
# Q2. Each document has a name (e.g. 11_8.txt), extract the first two digits from the name, make that integer, and assign that integer as a document level variable named “Doc_ID”. Print that variable for all of your documents.
## Step 1: Extract the first two digits.
## Step 2: Assign them as a docvar.
## Step 3: Print the docvar.

# Take the leading two characters of each document name (e.g. "11_8.txt"
# gives "11"), convert them to integers, and store the result as the
# document variable "Doc_ID".
id_prefix <- substring(docnames(mycorpus), 1, 2)
docvars(mycorpus, field = "Doc_ID") <- as.integer(id_prefix)

# Show the stored Doc_ID of every document
cat("Document IDs for all documents:\n")

## Document IDs for all documents:

cat("===============================\n")

## ===============================

print(docvars(mycorpus, "Doc_ID"))

##  [1] 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
## [26] 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60

# Answer Q3
# Q3. Keep only those documents with an ID larger than 20. Print the KWIC (key word in context) for the term “film” for all documents with ID larger than 20.
## Step 1: Filter the documents.
## Step 2: Tokenize the filtered corpus.
## Step 3: Use the kwic() function to search for the term “film”.
## Step 4: Print the result.

# Keep only the documents whose Doc_ID exceeds 20
corpus_filtered <- corpus_subset(mycorpus, Doc_ID > 20)

# kwic() is applied to a tokens object, so tokenize the subset and then
# locate each match of "film" with 5 tokens of context on either side.
filtered_toks <- tokens(corpus_filtered)
film_kwic <- kwic(filtered_toks, pattern = "film", window = 5)

# Show the matches
cat("KWIC results for 'film' in documents with ID > 20:\n")

## KWIC results for 'film' in documents with ID > 20:

cat("==================================================\n")

## ==================================================

print(film_kwic)

## Keyword-in-context with 53 matches.                                                                    
##    [21_9.txt, 11]          a small but absolutely wonderful | film |
##    [21_9.txt, 28]                    in the world this tiny | film |
##   [21_9.txt, 216]                     heart and soul of the | film |
##    [22_8.txt, 37]                  the era portrayed in the | film |
##   [22_8.txt, 180]                  part of the portrait the | film |
##   [22_8.txt, 329]                     of the tragedy of the | film |
##   [22_8.txt, 347]                        , and makes this a | film |
##   [22_8.txt, 374]               and cinematic style of this | film |
##   [22_8.txt, 461]                  after the war makes this | film |
##   [23_10.txt, 20]                              < br/ > this | film |
##    [26_10.txt, 2]                                      This | film |
##   [26_10.txt, 32]                             ) - - but the | film |
##    [28_9.txt, 13]     enjoyed seeing this truly magnificent | film |
##   [28_9.txt, 232]                     of acting. Watch this | film |
##   [28_9.txt, 261]                   / 10 for this excellent | film |
##   [29_10.txt, 58]               great supporting cast. This | film |
##   [30_9.txt, 239]                              < br/ > This | film |
##  [31_10.txt, 207]              > This is a character-driven | film |
##   [32_10.txt, 15]         Santiago-Hudson and an even finer | film |
##   [32_10.txt, 30]                the screen. This brilliant | film |
##  [32_10.txt, 282]                      In the course of the | film |
##  [32_10.txt, 391]                       years, and had this | film |
##   [34_8.txt, 122] Although it's clearly an African-American | film |
##   [34_8.txt, 269]               Imperfect but likable, good | film |
##     [35_9.txt, 7]                         once in a while a | film |
##    [35_9.txt, 19]                          know and love. A | film |
##    [35_9.txt, 49]                > Lackawanna Blues is that | film |
##    [35_9.txt, 82]                     based on a true story | film |
##   [35_9.txt, 182]                      no mistake; see this | film |
##   [37_9.txt, 136]                       > The music in this | film |
##   [37_9.txt, 155]                     And what's more, this | film |
##   [37_9.txt, 185]                 > I highly recommend this | film |
##   [37_9.txt, 204]              , very entertaining, quality | film |
##    [38_8.txt, 37]                guardianship of Nanny. The | film |
##    [38_8.txt, 68]                memoirs that are made into | film |
##   [38_8.txt, 143]                      But, overall a great | film |
##     [41_9.txt, 4]                          What a wonderful | film |
##  [42_10.txt, 105]                 was a made for television | film |
##  [42_10.txt, 152]                        / > I enjoyed this | film |
##    [44_8.txt, 46]               and lets you experience the | film |
##   [48_9.txt, 169]             in writing and producing this | film |
##   [48_9.txt, 178]                   who better to produce a | film |
##   [48_9.txt, 213]               or directors they make such | film |
##    [49_10.txt, 7]                    Blues is a very moving | film |
##   [49_10.txt, 28]                          a young boy. The | film |
##  [49_10.txt, 158]                     . I truly enjoyed the | film |
##    [57_10.txt, 4]                               I gave this | film |
##   [57_10.txt, 31]                         it would not be a | film |
##   [57_10.txt, 63]                This is a character driven | film |
##    [58_8.txt, 19]        enjoyable musical soundtrack. This | film |
##   [58_8.txt, 101]               erupted in applause for the | film |
##  [59_10.txt, 219]                 course ) I recommend this | film |
##    [60_8.txt, 25]              Ms. Merkerson dominates this | film |
##                                      
##  , brilliantly acted by Albert       
##  attracted enough attention to garner
##  . < br/ >                           
##  . While Courtenay's character is    
##  paints of a time and                
##  and play comes from the             
##  about him, as the                   
##  seem to come from another           
##  all the more poignant,              
##  . Taken from Ronald'                
##  has its detractors, and             
##  is top notch in every               
##  in the theater when it              
##  and enjoy! I recommend              
##  ( 1 point missing for               
##  takes you on an emotional           
##  has great energy, bringing          
##  ; the plot is secondary             
##  as the author adapted his           
##  ignites the screen with rich        
##  we are introduced to the            
##  been released in the theaters       
##  , being set in the                  
##  for a rainy day.                    
##  comes along with characters we      
##  where you see people you            
##  . Set in 1960s upstate              
##  , is told from a                    
##  for the amazing performance of      
##  perfectly blends with the story     
##  is actually entertaining. It        
##  to anyone wanting to spend          
##  .                                   
##  gives a good character study        
##  . The supporting cast adds          
##  with a truly outstanding performance
##  , filled with eccentric,            
##  . It was so well                    
##  so much I bought the                
##  the way the writer really           
##  . I guess who better                
##  based on your life other            
##  feel like you were actually         
##  depicting an era filled with        
##  clearly supports" it takes          
##  and eagerly await the DVD           
##  my rare 10 stars.                   
##  with a strong plot line             
##  , a true story,                     
##  reaches back to the 40              
##  and its director. <                 
##  as a fun and colorful               
##  with her tremendous presence as

# Answer Q4
# Q4. Tokenize the updated corpus, keep numbers but remove the punctuation and symbols. Print the result for the 1st document (with the ID of 11).
## Step 1: Tokenize the corpus.
## Step 2: Print the result.

# Tokenize the corpus: punctuation and symbols are dropped, numbers are kept
corpus_tokens <- tokens(
  mycorpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = FALSE
)

# Find the position of the document whose Doc_ID is 11 and print its tokens
is_doc_11 <- docvars(mycorpus, "Doc_ID") == 11
doc_11_pos <- which(is_doc_11)
cat("Tokenization results for document with ID 11:\n")

## Tokenization results for document with ID 11:

cat("=============================================\n")

## =============================================

print(corpus_tokens[doc_11_pos])

## Tokens consisting of 1 document and 1 docvar.
## 11_8.txt :
##  [1] "I"        "was"      "looking"  "forward"  "to"       "The"     
##  [7] "Guardian" "but"      "when"     "I"        "walked"   "into"    
## [ ... and 581 more ]

# Answer Q5
# Q5. With all documents, combine ‘own’, ‘sir’, and ‘god’ along with stopwords(‘english’) to a vector called mystopwords, then create a document-feature matrix (DFM) of the corpus, while removing mystopwords and punctuation. Print the top features (terms) of that DFM.
## Step 1: Combine the terms into the vector “mystopwords”.
## Step 2: Tokenize the corpus.
## Step 3: Create the DFM while removing mystopwords and punctuation.
## Step 4: Print the top features.

# Custom stopword vector: three extra terms plus the English stopword list
mystopwords <- c("own", "sir", "god", stopwords("english"))

# Tokenize every document without punctuation. This reuses the name
# corpus_tokens, replacing the tokens object created for Q4.
corpus_tokens <- tokens(mycorpus, remove_punct = TRUE)

# Build the DFM and drop the custom stopwords from its features
corpus_dfm <- dfm_remove(dfm(corpus_tokens), pattern = mystopwords)

# NOTE(review): the printed top features are led by "<", "br" and ">", so the
# HTML "<br />" tags in the reviews survive remove_punct = TRUE. Consider
# stripping the tags before tokenizing -- confirm against the assignment
# requirements.

# Print the 20 most frequent features
cat("Top features in the DFM (after removing custom stopwords and punctuation):\n")

## Top features in the DFM (after removing custom stopwords and punctuation):

cat("=========================================================================\n")

## =========================================================================

print(topfeatures(corpus_dfm, n = 20))

##          <         br          >      movie       film      great      story 
##        240        240        240         76         70         52         42 
##        one       like      nanny       good characters       time       life 
##         37         35         34         32         32         31         25 
##        see    dresser          s  merkerson     really      actor 
##         24         24         23         23         22         22

# Answer Q6
# Q6. After creating DFM, try to calculate tf-idf for the DFM. Print the top 10 features with highest tf-idf.
## Step 1: Calculate tf-idf.
## Step 2: Print the result.

# Weight the DFM by tf-idf
corpus_tfidf <- dfm_tfidf(corpus_dfm)

# Sum each feature's tf-idf weight over all documents and keep the 10 largest.
# head() returns fewer entries when the DFM has fewer than 10 features, whereas
# indexing with [1:10] would pad the result with NA.
tfidf_scores <- colSums(corpus_tfidf)
top_10_features <- head(sort(tfidf_scores, decreasing = TRUE), 10)

cat("Top 10 features with highest TF-IDF scores:\n")

## Top 10 features with highest TF-IDF scores:

cat("==========================================\n")

## ==========================================

print(top_10_features)

##        <       br        >    movie     film  dresser    great    nanny 
## 40.19786 40.19786 40.19786 24.22567 16.56004 15.78186 15.65356 14.28736 
##      one     like 
## 13.93978 13.92790

Assignment 1: Text Data Preparation

Qiao Qiao

2025-09-16