In each step, you will process your data for common text data issues. Be sure to complete each step in R and Python separately, creating a clean text version in each language for comparison at the end. Update the saved clean text at each step; do not simply print it out.
##r chunk
library(reticulate)
py_config()
## python: /usr/bin/python3
## libpython: /usr/lib/python3.8/config-3.8-x86_64-linux-gnu/libpython3.8.so
## pythonhome: //usr://usr
## version: 3.8.10 (default, Jun 2 2021, 10:49:15) [GCC 9.4.0]
## numpy: /usr/lib/python3/dist-packages/numpy
## numpy_version: 1.17.4
library(stringr)
library(rvest)
library(tokenizers)
library(stringi)
library(textclean)
library(hunspell)
library(textstem)
## Loading required package: koRpus.lang.en
## Loading required package: koRpus
## Loading required package: sylly
## For information on available language packages for 'koRpus', run
##
## available.koRpus.lang()
##
## and see ?install.koRpus.lang()
library(tm)
## Loading required package: NLP
##
## Attaching package: 'tm'
## The following object is masked from 'package:koRpus':
##
## readTagged
##python chunk
import requests
from bs4 import BeautifulSoup
import unicodedata
from nltk.corpus import stopwords
import nltk
import spacy
Pick a movie that you would be interested in analyzing. You can also use two one-hour-long TV shows or four half-hour-long TV shows.
Search for the subtitles for that movie or those TV shows. Please note that some of the subtitle websites are junk - do not download anything from a popup! Podnapisi appears to be an OK site.
Find the subtitles for your project.
List your movie/show here (note that each group should have a different set of subtitles, I will coordinate). If you have picked a TV show, list the specific episodes.
Use EITHER R or Python to import the text for your movie/tv shows.
##r chunk
url = "https://imsdb.com/scripts/Black-Panther.html"
text = read_html(url)
text_clean = html_text(text)
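# hedged sketch, not used downstream: html_text() on the whole page also keeps
# navigation/JavaScript text (visible in the tokens later); if the script body
# sits inside a <pre> node (an assumption to verify in the page source), it
# could be isolated like this ('text_script' is an assumed name)
text_script <- html_text(html_node(text, "pre"))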
##python chunk
import requests
blog_post = requests.get("https://imsdb.com/scripts/Black-Panther.html")
content = blog_post.content
clean_content = BeautifulSoup(content, 'html.parser')
clean_text_p = clean_content.get_text()
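# hedged sketch, not used downstream (same idea as the R side): if the script
# text sits inside a <pre> tag, it could be isolated instead of taking the
# whole page's text; 'script_text' is an assumed name
script_tag = clean_content.find('pre')
script_text = script_tag.get_text() if script_tag is not None else clean_text_p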
##r chunk
text_clean_lower <- tolower(text_clean)
##python chunk
clean_text_p = clean_text_p.lower()
Use the stringi package to remove any symbols from your text.
##r chunk
text_clean_lower_s <- stri_trans_general(str = text_clean_lower,
                                         id = "Latin-ASCII")
Use unicodedata in python to remove any symbols from your text.
##python chunk
import unicodedata
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text
clean_text_p = remove_accented_chars(clean_text_p)
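# hedged sketch, not used downstream: unicodedata only strips accents, so a
# regex like this could drop any remaining punctuation/symbols (the helper name
# and pattern are assumptions; apostrophes are kept for the contraction step)
import re
def remove_special_chars_sketch(text):
    return re.sub(r"[^a-z0-9\s']", " ", text)
# example usage: clean_text_p = remove_special_chars_sketch(clean_text_p)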
##r chunk
words = unlist(tokenize_words(text_clean_lower_s))
words = str_replace_all(words, pattern = "'", replacement = "")
replace_contraction(words,
                    contraction.key = lexicon::key_contractions,
                    ignore.case = T)[11]
## [1] "function"
##python chunk
import contractions
clean_text_p = contractions.fix(clean_text_p)
Check the spelling in your text using the hunspell package in R - it's ok to use the first, most probable option, like we did in class.
##r chunk
spelling.errors <- hunspell(text_clean_lower_s)
spelling.sugg <- hunspell_suggest(unlist(spelling.errors), dict = dictionary("en_US"))
# Pick the first suggestion
spelling.sugg <- unlist(lapply(spelling.sugg, function(x) x[1]))
# Build an errors-to-suggestions lookup (unlist() keeps the two columns aligned)
spelling.dict <- as.data.frame(cbind(spelling.errors = unlist(spelling.errors), spelling.sugg))
spelling.dict$spelling.pattern <- paste0("\\b", spelling.dict$spelling.errors, "\\b")
# Replace the misspelled words with the first suggestion
replace <- stri_replace_all_regex(str = text_clean_lower_s,
                                  pattern = spelling.dict$spelling.pattern,
                                  replacement = spelling.dict$spelling.sugg,
                                  vectorize_all = FALSE)
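# optional sanity-check sketch (purely illustrative): re-running hunspell on the
# corrected text should flag fewer words than were found originally
length(unlist(hunspell(replace)))
length(unlist(spelling.errors))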
Check the spelling in your text using textblob from python.
##python chunk
from textblob import TextBlob
clean_text_p_correct = TextBlob(clean_text_p).correct()
clean_text_p_correct = str(clean_text_p_correct)
Lemmatize your text in R using textstem.
##r chunk
lemmatization <- lemmatize_words(replace)
Lemmatize your text in python using spacy.
##python chunk
import en_core_web_sm
nlp = en_core_web_sm.load()
def lemmatize_text(text):
    text = nlp(text)
    text = " ".join([word.lemma_ if word.lemma_ != "-PRON-" else word.text for word in text])
    return text
clean_text_p_lemmatized = lemmatize_text(clean_text_p_correct)
##r chunk
library(tm)
head(stopwords(kind = "SMART"))
## [1] "a" "a's" "able" "about" "above" "according"
stop_word <- removeWords(lemmatization, stopwords(kind = "SMART"))
##python chunk
import nltk
from nltk.corpus import stopwords
set(stopwords.words('english'))
## {'so', 'are', 'hers', 'it', 'on', 'not', "couldn't", 'other', 'had', 'she', 'at', "haven't", 'for', "aren't", 'mightn', 'and', 'its', 'my', 'the', 'theirs', 'but', 'i', 'how', "isn't", 'until', "hasn't", 'been', 'because', 'himself', 'don', 'were', 'having', "mustn't", 'no', 'd', 'me', 'up', 'where', 'o', "doesn't", 'shouldn', 'each', 'or', 'now', 'once', 'ours', 'as', "won't", 'before', 'y', 'against', 'myself', 'very', "wasn't", 'has', 'm', 'when', "weren't", 'doing', 'his', 'ma', "shan't", 'yourselves', 'off', 've', "didn't", 'hadn', 'your', 'aren', 'doesn', 'just', 'can', 'than', 'itself', 'any', 'few', 'during', 'haven', 'more', 'same', 'our', "you'll", 'after', "should've", 're', 'needn', 'am', 'should', "don't", 'while', 'own', 'being', 'them', 'such', 'be', 'by', 'him', "it's", 'in', 'll', "that'll", 'weren', 'down', 'couldn', "mightn't", "you're", 'ourselves', 'further', 'above', 'below', 'some', "she's", 'there', 'all', 'most', 'does', 'ain', 'her', 'into', 't', 'is', 'an', 'of', 'a', 'through', "needn't", "hadn't", 'both', 'you', 'nor', 'won', "you've", 'wouldn', 'about', 'have', "you'd", 'they', 'did', 'then', 'that', 'mustn', 'wasn', 'do', 'again', 'only', 'who', 'whom', 'too', "shouldn't", 'out', 'under', 'with', 'over', 'to', 'isn', 's', 'those', 'will', 'themselves', 'we', 'between', 'he', "wouldn't", 'this', 'shan', 'yours', 'here', 'their', 'didn', 'which', 'herself', 'yourself', 'these', 'if', 'from', 'why', 'what', 'was', 'hasn'}
clean_text_p_stopwords = [word for word in nltk.word_tokenize(clean_text_p_lemmatized) if word not in stopwords.words('english')]
Use the tokenize_words function to create a set of words for your R clean text.
##r chunk
tokenization <- tokenize_words(stop_word,
                               lowercase = T,
                               stopwords = NULL, #more on this later
                               strip_punct = T,
                               strip_numeric = F,
                               simplify = F) #list format
Use nltk or spacy to tokenize the words from your python clean text.
##python chunk
# note: clean_text_p_stopwords is already a list of words, so str() turns it
# back into one bracketed string before tokenizing (which is why brackets and
# quote marks show up in the tokens printed below)
token_p = nltk.word_tokenize(str(clean_text_p_stopwords))
##r chunk
tokenization[[1]][1:100]
## [1] "var" "_gaq"
## [3] "_gaq" "_gaq.push"
## [5] "_setaccount" "ua"
## [7] "3785444" "3"
## [9] "_gaq.push" "_trackpageview"
## [11] "function" "var"
## [13] "ga" "document.createelement"
## [15] "script" "ga.type"
## [17] "text" "javascript"
## [19] "ga.async" "true"
## [21] "ga.src" "https"
## [23] "document.location.protocol" "https"
## [25] "ssl" "http"
## [27] "www" "google"
## [29] "analytics" "ga.js"
## [31] "var" "document.getelementsbytagname"
## [33] "script" "0"
## [35] "parentnode.insertbefore" "ga"
## [37] "internet" "movie"
## [39] "script" "database"
## [41] "imsdb" "web"
## [43] "largest" "movie"
## [45] "script" "resource"
## [47] "e9" "object"
## [49] "e9" "size"
## [51] "728x90" "search"
## [53] "imsdb" "alphabetical"
## [55] "qr" "genre"
## [57] "action" "adventure"
## [59] "animationcomedy" "crime"
## [61] "dramafamily" "fantasy"
## [63] "film" "noirhorror"
## [65] "musical" "mysteryromance"
## [67] "sci" "fi"
## [69] "shortthriller" "war"
## [71] "western" "sponsor"
## [73] "e9" "object"
## [75] "e9" "size"
## [77] "300x250" "tv"
## [79] "transcripts" "futuramaseinfeldsouth"
## [81] "parkstargate" "sg"
## [83] "1lostthe" "4400"
## [85] "international" "french"
## [87] "scripts" "latest"
## [89] "comments" "scripts"
## [91] "black" "panther"
## [93] "written" "ryan"
## [95] "coogler" "joe"
## [97] "robert" "cole"
## [99] "ext" "deep"
##python chunk
print(token_p[:100])
## ['[', "'war", "'", ',', "'gap", "'", ',', "'=", "'", ',', "'gap", "'", ',', "'||", "'", ',', "'", '[', "'", ',', "'", ']', "'", ',', "'", ';', "'", ',', "'gap.push", "'", ',', "'", '(', "'", ',', "'", '[', "'", ',', '``', "'_setaccount", "''", ',', '``', "'", "''", ',', "'", ',', "'", ',', '``', "'", "''", ',', "'", 'a-3785444', "'", ',', "'-", "'", ',', "'", '3', "'", ',', '``', "'", "''", ',', "'", ']', "'", ',', "'", ')', "'", ',', "'", ';', "'", ',', "'gap.push", "'", ',', "'", '(', "'", ',', "'", '[', "'", ',', '``', "'_trackpageview", "''", ',', '``', "'", "''"]
Note: here you can print out, summarize, or otherwise view your text in any way you want.
##r chunk
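# one possible way to view the final R clean text: token count plus the first few tokens
length(tokenization[[1]])
head(tokenization[[1]], 20)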
##python chunk
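# one possible way to view the final python clean text: token count plus the first few tokens
print(len(token_p))
print(token_p[:20])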
QUESTION: Compare the results from your processing. Write a short paragraph answering the following questions. You will need to write more than a few sentences for credit.
ANSWER: The R result text looks cleaner to me.
ANSWER: The two results are not exactly the same.
ANSWER: The Python result also includes punctuation such as commas and periods.
ANSWER: Python appears easier to me, since it takes fewer lines of code than the R version.