# Setup ----
# Install only the packages that are missing, instead of reinstalling all of
# them every time the script runs, then attach the packages whose functions
# are called unqualified in this script.
required_packages <- c("tidytext", "tidyr", "quanteda", "readtext")
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}

library("quanteda")
# readtext() is called when the data is read, so readtext must be attached too.
library("readtext")


# Q1: Read all .txt files from the folder titled data under Assignment 1,
# generate a corpus. Print the corpus.

# Single definition of the data folder so the path is not repeated.
data_dir <- "~/Desktop/PPOL Spring Semester 2022/IPEN 5250 Text analysis and Machine Learning/assignments/Assignment 1/Data"

# NOTE(review): readtext() below receives the full path, so this setwd() looks
# unnecessary; it is kept only to preserve the existing working-directory
# side effect. Consider removing it.
setwd(data_dir)

# Namespace-qualified so the call works whether or not readtext is attached.
my_data <- readtext::readtext(data_dir)
my_corpus <- corpus(my_data)
print(my_corpus)
  

# Q2: Each document has a name (e.g. 11_8.txt). Extract the first two
# characters of the name, convert them to an integer, and assign that integer
# as a document variable (docvars) named "ID". Print the head of the summary
# of the updated corpus.
id_prefix <- substring(docid(my_corpus), 1, 2)
# as.integer() yields NA (with a warning) for any prefix that is not numeric.
docvars(my_corpus, "ID") <- as.integer(id_prefix)

# summary() returns one row per document including its docvars, so the new ID
# column is visible; head() of the corpus itself would only show the texts.
print(head(summary(my_corpus)))

# Q3: Keep only those documents with an ID larger than 20. Print the head of
# the summary of the updated corpus.
my_corpus <- corpus_subset(my_corpus, ID > 20)

# Print the per-document summary (which includes the ID docvar) rather than
# the raw texts, as the task asks for the head of the summary.
print(head(summary(my_corpus)))


# Q4: Tokenize the updated corpus, keep numbers but remove the punctuation.
# Print the head of the tokenized result.
#
# remove_numbers must be FALSE: the task requires numbers to be kept, and
# TRUE would drop every numeric token.
# NOTE: from this point `my_corpus` holds a tokens object, not a corpus; the
# name is kept because the DFM step reads it.
my_corpus <- tokens(
  my_corpus,
  remove_punct = TRUE,
  remove_numbers = FALSE
)
print(head(my_corpus))


# Q5: Combine "own", "sir", and "god" with stopwords("english") into a vector
# called mystopwords, then create a document-feature matrix (DFM) with
# mystopwords and punctuation removed. Print the top features of that DFM.
mystopwords <- c("own", "sir", "god", stopwords("english"))

# Remove stopwords at the tokens level: the `remove` argument of dfm() is
# deprecated in quanteda 3 and no longer applied in later releases, which
# would leave the stopwords in the matrix. Punctuation was already removed
# when `my_corpus` was tokenized.
tokens_no_stop <- tokens_remove(my_corpus, pattern = mystopwords)
myDfm <- dfm(tokens_no_stop)

# topfeatures() reports the most frequent features; head() would only show
# the first rows of the matrix.
print(topfeatures(myDfm))


# After creating the DFM, calculate count, frequency, and tf-idf weightings
# for it. Print the top features of each type of calculation.

# Dimensions of the matrix: number of documents and number of features.
print(ndoc(myDfm))
print(nfeat(myDfm))

# Count: the DFM already stores raw term counts.
print(topfeatures(myDfm))

# Frequency: each count divided by the total count of its document.
freq_dfm <- dfm_weight(myDfm, scheme = "prop")
print(topfeatures(freq_dfm))

# tf-idf: raw term count weighted by base-10 inverse document frequency.
tfidf_dfm <- dfm_tfidf(
  myDfm,
  scheme_tf = "count",
  scheme_df = "inverse",
  base = 10,
  force = FALSE
)
print(topfeatures(tfidf_dfm))

# R Markdown ----

# This is an R Markdown document. Markdown is a simple formatting syntax for
# authoring HTML, PDF, and MS Word documents. For more details on using
# R Markdown, see <http://rmarkdown.rstudio.com>.

# When you click the "Knit" button, a document will be generated that includes
# both the content and the output of any embedded R code chunks within the
# document. R code chunks can be embedded in the document (the template's
# example chunk is not included in this file).

# Including Plots ----

# You can also embed plots in the document (the template's example plot chunk
# is not included in this file).

# Note that adding the `echo = FALSE` parameter to a code chunk prevents
# printing of the R code that generated the plot.