In each step, you will process your data for common text data issues. Complete each step in R and Python separately, creating a clean text version in each language for comparison at the end. Update the saved clean text at each step; do not simply print it out.
##r chunk
library(reticulate)
py_config()
## python: /Users/zosiajiang/Library/r-miniconda/envs/r-reticulate/bin/python
## libpython: /Users/zosiajiang/Library/r-miniconda/envs/r-reticulate/lib/libpython3.6m.dylib
## pythonhome: /Users/zosiajiang/Library/r-miniconda/envs/r-reticulate:/Users/zosiajiang/Library/r-miniconda/envs/r-reticulate
## version: 3.6.10 |Anaconda, Inc.| (default, Mar 25 2020, 18:53:43) [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
## numpy: /Users/zosiajiang/Library/r-miniconda/envs/r-reticulate/lib/python3.6/site-packages/numpy
## numpy_version: 1.19.0
library(stringr)
library(rvest)
## Loading required package: xml2
library(tokenizers)
# py_install('bs4', pip=T)
library(stringi)
library(textclean)
# py_install("contractions",pip = T)
library(hunspell)
# py_install("textblob",pip = T)
# install the en_core_web_sm spaCy model
# py_install("https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz",pip = T)
##python chunk
import requests
from bs4 import BeautifulSoup
import nltk
import spacy
import en_core_web_sm
# this is a very common naming convention for loading spaCy language models
nlp = en_core_web_sm.load()
import unicodedata
import contractions
import textblob
Use rvest to import a webpage and process that text for HTML codes (i.e., take them out)!
##r chunk
blog_url <- "https://www.theverge.com/2020/8/1/21350578/tiktok-ban-president-trump-comments-order"
blog_post <- read_html(blog_url)
clean_text_r <- html_text(blog_post)
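# Optional sketch: html_text() on the whole page keeps <script> and navigation text,
# which is why tokens like "var" and "chorus" show up once the text is tokenized.
# One way to narrow the scrape to the article body is to grab only the paragraph
# nodes first (the "p" selector here is an assumption about this page's layout):
# clean_text_r_article <- paste(html_text(html_nodes(blog_post, "p")), collapse = " ")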
Use the requests package to import the same webpage and use BeautifulSoup to clean up the HTML codes.
##python chunk
blog_post = requests.get("https://www.theverge.com/2020/8/1/21350578/tiktok-ban-president-trump-comments-order")
content = blog_post.content
clean_content = BeautifulSoup(content, "html.parser") # specify a parser to avoid the default-parser warning
clean_text_p = clean_content.get_text()
# print(clean_text_p[:60])
Lower case your text in R and Python.
##r chunk
clean_text_r <- tolower(clean_text_r)
##python chunk
clean_text_p = clean_text_p.lower()
Use the stringi package to remove any symbols from your text.
##r chunk
clean_text_r_no_symbol <- stri_trans_general(str = clean_text_r,
id = "Latin-ASCII")
Use unicodedata in Python to remove any symbols from your text.
##python chunk
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text
clean_text_p = remove_accented_chars(clean_text_p)
Replace the contractions in your text.
##r chunk
clean_text_r_replace_1 <- str_replace_all(clean_text_r_no_symbol,
pattern = "’",
replacement = "'")
clean_text_r_no_contract <- replace_contraction(clean_text_r_replace_1,
contraction.key = lexicon::key_contractions,
ignore.case = T)
##python chunk
clean_text_p = contractions.fix(clean_text_p)
Use the hunspell package in R to check the spelling - it's ok to use the first, most probable option, like we did in class.
##r chunk
clean_text_r_tokens <- tokenize_words(clean_text_r_no_contract,
lowercase = T,
stopwords = NULL, #more on this later
strip_punct = T,
strip_numeric = F,
simplify = F)[1]
spelling.errors <- hunspell(as.character(clean_text_r_tokens))
spelling.sugg <- hunspell_suggest(unique(unlist(spelling.errors)), dict = dictionary("en_US"))
# pick the first suggestion
spelling.sugg <- unlist(lapply(spelling.sugg, function(x) x[1]))
spelling.dict <- as.data.frame(cbind(spelling.errors = unique(unlist(spelling.errors)),spelling.sugg))
spelling.dict$spelling.pattern <- paste0("\\b", spelling.dict$spelling.errors, "\\b")
# replace the words
clean_text_r_tokens_correct <- stri_replace_all_regex(str = c(clean_text_r_tokens)[[1]],
pattern = spelling.dict$spelling.pattern,
replacement = spelling.dict$spelling.sugg,
vectorize_all = FALSE)
clean_text_r_correct <- paste(clean_text_r_tokens_correct, collapse = ' ')
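# Sanity-check sketch: peek at the error/suggestion pairs hunspell produced, since
# automatic corrections on scraped boilerplate can swap in some odd words
head(spelling.dict)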
Use textblob from Python to correct the spelling.
##python chunk
from textblob import TextBlob
clean_text_p_correct = TextBlob(clean_text_p).correct()
clean_text_p_correct = str(clean_text_p_correct)
Lemmatize your text in R using textstem.
##r chunk
library(textstem)
## Loading required package: koRpus.lang.en
## Loading required package: koRpus
## Loading required package: sylly
## For information on available language packages for 'koRpus', run
##
## available.koRpus.lang()
##
## and see ?install.koRpus.lang()
clean_text_r_lemmatized <- lemmatize_strings(clean_text_r_correct)
Lemmatize your text in Python using spacy.
##python chunk
def lemmatize_text(text):
    text = nlp(text)
    text = " ".join([word.lemma_ if word.lemma_ != "-PRON-" else word.text for word in text])
    return text
clean_text_p_lemmatized = lemmatize_text(clean_text_p_correct)
Remove the stopwords from your text in both languages.
##r chunk
library(tm)
## Loading required package: NLP
head(tm::stopwords(kind = "SMART"))
## [1] "a" "a's" "able" "about" "above" "according"
clean_text_r_stopwords <- removeWords(clean_text_r_lemmatized, stopwords(kind = "SMART"))
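# Optional cleanup sketch: removeWords() leaves runs of extra whitespace where the
# stopwords used to be; tm::stripWhitespace() collapses them if you want a tidier string.
# clean_text_r_stopwords <- stripWhitespace(clean_text_r_stopwords)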
##python chunk
from nltk.corpus import stopwords
set(stopwords.words('english'))
## {'from', 'ain', "mustn't", 'each', 'who', 'weren', 'my', 'because', 'they', 'ma', 'does', 'between', 'yourself', 'don', 'our', 'being', 'during', 'why', 'nor', 'y', 'couldn', 'over', 'those', "needn't", 'hadn', 'what', 'your', 'again', 'further', 'other', 'such', 'their', "hasn't", 'to', 'can', 'have', "haven't", 'her', 'very', 'out', 'after', 'until', 'did', 'it', 'needn', 'both', 'herself', 'as', 'or', 'isn', 'shouldn', "weren't", "couldn't", 'ours', 'down', 'won', 'than', "hadn't", 'was', 'not', "shouldn't", 'by', 'before', "didn't", "shan't", 'didn', "you'll", 'has', 'yourselves', 'itself', 'how', 'been', "should've", 'too', 'into', 'hasn', 's', 'then', 'a', 'i', 'where', 'be', 'were', 'for', 'more', 'so', 'll', 'mustn', 'yours', 'about', 'against', 'once', 'here', "wasn't", 'no', 'most', 'the', 'through', "it's", 'mightn', "she's", 'below', 'same', 'whom', 'in', 'only', 'few', 'now', 'on', 'of', 'when', 'wouldn', 'up', 'will', 'there', 'had', 'theirs', 'an', 'shan', 'with', "wouldn't", 'is', 'he', 'this', 'o', "don't", 'd', 'hers', 'but', 'him', 'wasn', 've', 'if', 'ourselves', "aren't", "isn't", 'myself', "you'd", 'themselves', 'its', 'aren', 'are', 't', 'some', 'doing', 'these', 'm', "that'll", 'you', 'doesn', 'that', 'me', 'which', 'haven', "mightn't", 'own', 'do', 'off', 'she', 'them', 'and', 'we', "you're", 'having', 'his', 'while', 'all', 'any', 'should', 'himself', 'am', 'under', 're', "won't", 'at', 'above', "you've", "doesn't", 'just'}
clean_text_p_stopwords = [word for word in nltk.word_tokenize(clean_text_p_lemmatized) if word not in stopwords.words('english')]
Use the tokenize_words function to create a set of words for your R clean text.
##r chunk
token_r <- tokenize_words(clean_text_r_stopwords,
lowercase = T,
strip_punct = T,
strip_numeric = F,
simplify = F)
Use nltk or spacy to tokenize the words from your Python clean text.
##python chunk
token_p = nltk.word_tokenize(" ".join(clean_text_p_stopwords)) # join the stopword-filtered tokens back into a string before tokenizing
#spacy_processed = nlp(" ".join(clean_text_p_stopwords))
#token_p1 = [word.text for word in spacy_processed]
##r chunk
token_r[[1]][1:100]
## [1] "president" "trump" "ban"
## [4] "ticktock" "today" "verge"
## [7] "var" "na" "volume"
## [10] "embed" "host" "http"
## [13] "volume" "cox" "var"
## [16] "chorus" "chorus" "chorus"
## [19] "window" "load" "1"
## [22] "chorus" "add" "script"
## [25] "function" "var" "document"
## [28] "create" "element" "script"
## [31] "sync" "0" "type"
## [34] "text" "java" "script"
## [37] "arc" "function" "type"
## [40] "unload" "var" "na"
## [43] "script" "0" "return"
## [46] "parent" "node" "insert"
## [49] "chorus" "ready" "function"
## [52] "load" "document" "ready"
## [55] "state" "document" "transcendentalist"
## [58] "document" "transcendentalist" "contentedness"
## [61] "document" "attach" "event"
## [64] "noninterchangeable" "function" "load"
## [67] "document" "ready" "state"
## [70] "chorus" "unload" "function"
## [73] "chorus" "window" "load"
## [76] "0" "return" "void"
## [79] "var" "window" "unload"
## [82] "function" "type" "window"
## [85] "unload" "window" "unload"
## [88] "window" "unload" "function"
## [91] "chorus" "unload" "function"
## [94] "chorus" "window" "load"
## [97] "0" "var" "datum"
## [100] "layer"
##python chunk
print(token_p[:100])
#print(token_p1[:100])
## ['[', "'president", "'", ',', "'tramp", "'", ',', "'say", "'", ',', "'ban", "'", ',', "'tikhon", "'", ',', "'us", "'", ',', "'today", "'", ',', "'-", "'", ',', "'verge", "'", ',', "'skin", "'", ',', "'main", "'", ',', "'content", "'", ',', "'clock", "'", ',', "'menu", "'", ',', "'-", "'", ',', "'arrow", "'", ',', "'yes", "'", ',', "'verge", "'", ',', "'homage", "'", ',', "'follow", "'", ',', "'verge", "'", ',', "'online", "'", ',', "'", ':', "'", ',', "'follow", "'", ',', "'verge", "'", ',', "'facebook", "'", ',', "'follow", "'", ',', "'verge", "'", ',', "'twitt", "'", ',', "'ross", "'", ',', "'feed", "'", ',', "'", '(', "'", ',', "'story"]
Note: here you can print out, summarize, or otherwise view your text in any way you want.
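One possible way to do the final comparison mentioned at the start is a quick side-by-side summary in R; this is only a sketch, and it assumes the Python objects are still reachable from R through reticulate's py object.
##r chunk
# pull the Python stopword-filtered tokens over to R and compare them to the R tokens
r_tokens <- token_r[[1]]
p_tokens <- py$clean_text_p_stopwords
length(r_tokens)
length(p_tokens)
# shared vocabulary between the two cleaned versions
length(intersect(unique(r_tokens), unique(p_tokens)))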