In each step, you will process your data for common text data issues. Be sure to complete each step in R and Python separately, creating a clean text version in each language for comparison at the end. Update the saved clean text at each step; do not simply print it out.
##r chunk
library(rvest)
## Loading required package: xml2
library(stringr)
library(stringi)
library(textclean)
library(hunspell)
library(tokenizers)
library(textstem)
## Loading required package: koRpus.lang.en
## Loading required package: koRpus
## Loading required package: sylly
## For information on available language packages for 'koRpus', run
##
## available.koRpus.lang()
##
## and see ?install.koRpus.lang()
library(tm)
## Loading required package: NLP
library(reticulate)
py_config()
## python: /Users/emilyhuang/Library/r-miniconda/envs/r-reticulate/bin/python
## libpython: /Users/emilyhuang/Library/r-miniconda/envs/r-reticulate/lib/libpython3.6m.dylib
## pythonhome: /Users/emilyhuang/Library/r-miniconda/envs/r-reticulate:/Users/emilyhuang/Library/r-miniconda/envs/r-reticulate
## version: 3.6.10 |Anaconda, Inc.| (default, Mar 25 2020, 18:53:43) [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
## numpy: /Users/emilyhuang/Library/r-miniconda/envs/r-reticulate/lib/python3.6/site-packages/numpy
## numpy_version: 1.19.1
##python chunk
import requests
from bs4 import BeautifulSoup
import unicodedata
from nltk.corpus import stopwords
import nltk
import spacy
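The NLTK resources used later in this document (the stopword lists and the punkt tokenizer models) are separate downloads; if they are not already on your machine, a one-time setup step like the following is needed (a sketch, only for environments where they have not been downloaded before):
nltk.download('stopwords')  # word lists used in the stopword removal step
nltk.download('punkt')      # tokenizer models used by nltk.sent_tokenize / nltk.word_tokenize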
Use the rvest package to import a webpage and process that text for HTML codes (i.e., take them out)!
##r chunk
library(rvest)
url = 'https://harvest.org/resources/gregs-blog/post/kobe-bryants-death-reminds-us-life-is-not-fair/'
text = read_html(url)
text_clean = html_text(text)
Use the requests package to import the same webpage and use BeautifulSoup to clean up the HTML codes.
##python chunk
import requests
text = requests.get("https://harvest.org/resources/gregs-blog/post/kobe-bryants-death-reminds-us-life-is-not-fair/")
content = text.content
from bs4 import BeautifulSoup
content_clean = BeautifulSoup(content, "html.parser")  # parse the raw HTML
text_clean_py = content_clean.get_text()               # strip the tags, keep the text
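Note that get_text() on the whole page also keeps the contents of script and style tags, which is why JavaScript fragments show up in the token lists at the end. A minimal sketch of pulling only the visible paragraph text instead (the variable name article_text_py is just for illustration, not part of the assignment) would be:
paragraphs = content_clean.find_all("p")                      # only the paragraph nodes of the article
article_text_py = " ".join(p.get_text() for p in paragraphs)  # rebuild the text from those nodes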
Lowercase your text in each language.
##r chunk
text_clean_lower <- tolower(text_clean)
##python chunk
text_clean_py_lower = text_clean_py.lower()
Use the stringi package to remove any symbols from your text.
##r chunk
library(stringi)
text_clean_lower_s <- stri_trans_general(str = text_clean_lower,
                                         id = "Latin-ASCII")
Use unicodedata in Python to remove any symbols from your text.
##python chunk
import unicodedata
def remove_accent(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text
text_1 = remove_accent(text_clean_py_lower)
Replace the contractions in your text using the textclean package.
##r chunk
library(textclean)
words = unlist(tokenize_words(text_clean_lower_s))
words = str_replace_all(words, pattern = "'",
                        replacement = "")
replace_contraction(words,
                    contraction.key = lexicon::key_contractions,
                    ignore.case = T)[11]
## [1] "bryants"
Spell check your text using the hunspell package in R - it's OK to use the first, most probable option, like we did in class.
##r chunk
spelling.errors <- hunspell(text_clean_lower_s)
spelling.sugg <- hunspell_suggest(unlist(spelling.errors), dict = dictionary("en_US"))
# Pick the first suggestion for each misspelled word
spelling.sugg <- unlist(lapply(spelling.sugg, function(x) x[1]))
spelling.dict <- as.data.frame(cbind(spelling.errors = unlist(spelling.errors), spelling.sugg))
spelling.dict$spelling.pattern <- paste0("\\b", spelling.dict$spelling.errors, "\\b")
# Replace the misspelled words with their first suggestion
replace <- stri_replace_all_regex(str = text_clean_lower_s,
                                  pattern = spelling.dict$spelling.pattern,
                                  replacement = spelling.dict$spelling.sugg,
                                  vectorize_all = FALSE)
Spell check your text using textblob from Python.
##python chunk
def remove_accent(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text
text_1 = remove_accent(text_1)
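The chunk above only re-runs the accent remover; a minimal sketch of the actual textblob spell correction (assuming the textblob package and its corpora are installed - note that correct() can be slow on a full page of text) would be:
from textblob import TextBlob
text_1 = str(TextBlob(text_1).correct())  # replace each word with textblob's most probable spelling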
Lemmatize your text using textstem in R.
##r chunk
lemmatization <- lemmatize_strings(replace) # lemmatize_strings works on a full string; lemmatize_words expects a vector of single words
Lemmatize your text using spacy in Python.
##python chunk
import spacy
sp = spacy.load('en_core_web_sm')                    # requires the small English model to be installed
sentence = sp(text_1)
text_1 = " ".join(word.lemma_ for word in sentence)  # save the lemmatized version of the clean text
Remove stopwords from your text in both languages.
##r chunk
head(stopwords(kind = "SMART"))
## [1] "a" "a's" "able" "about" "above" "according"
stop_word <- removeWords(lemmatization, stopwords(kind = "SMART"))
##python chunk
from nltk.corpus import stopwords
set(stopwords.words('english'))
## {'at', 'after', 'below', 'y', 'we', "won't", 'because', 'which', 'nor', 'until', 'this', 'few', 'herself', 'such', 'was', "mightn't", 'wouldn', 'shouldn', 'am', "didn't", 'i', 'no', 'himself', 'did', 'over', 'out', 'hers', 'll', 'above', 'from', 'weren', 'just', 'who', 'itself', 'mightn', 'through', 'during', 'theirs', 'while', 'other', 'my', 'their', 'on', 'further', 'aren', 'for', 's', 'is', 't', 'under', 'wasn', 'ain', 'what', 'ours', 'these', 'be', 'of', 'hasn', 'she', 'more', 'into', 'his', "you'll", 'some', 'he', 'having', 'why', 'any', 'myself', 'don', 'too', 'off', 'all', "she's", 'to', 'very', 've', "you're", 'only', 'those', 'd', 'mustn', 'couldn', "needn't", 'won', 'in', 'been', 'each', 'ourselves', 'against', 'yours', 'your', 'now', 'that', 'had', 'by', "should've", 'down', 'than', 'should', "that'll", 'me', 're', 'you', 'him', 'were', "mustn't", 'isn', 'm', "wouldn't", 'haven', 'an', 'again', 'ma', 'how', 'own', "you'd", 'but', 'has', 'and', 'here', 'when', "doesn't", 'does', "you've", 'needn', 'its', 'between', 'with', 'then', 'will', 'doesn', 'up', "weren't", "aren't", 'can', 'are', "hadn't", "hasn't", 'they', 'o', 'do', 'where', 'our', 'both', 'yourself', 'it', 'whom', "it's", 'or', 'have', 'doing', "shan't", "shouldn't", 'shan', 'as', "couldn't", "wasn't", 'once', 'didn', "haven't", 'there', 'same', 'her', 'the', 'about', 'themselves', 'hadn', 'if', 'most', 'yourselves', 'being', 'before', "don't", "isn't", 'a', 'not', 'them', 'so'}
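The chunk above only prints the stopword list; a minimal sketch that actually removes those words from the saved Python text (splitting on whitespace and rejoining) would be:
stop_words = set(stopwords.words('english'))
text_1 = " ".join(word for word in text_1.split() if word not in stop_words)  # drop stopwords, update the saved text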
Use the tokenize_words function to create a set of words for your R clean text.
##r chunk
library(tokenizers)
tokenization <- tokenize_words(stop_word,
                               lowercase = T,
                               stopwords = NULL, # more on this later
                               strip_punct = T,
                               strip_numeric = F,
                               simplify = F) # list format
Use nltk or spacy to tokenize the words from your Python clean text.
##python chunk
import nltk
token = nltk.sent_tokenize(text_1)
token_words = nltk.word_tokenize(text_1)
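nltk.word_tokenize keeps punctuation marks as separate tokens; to parallel strip_punct = T on the R side, a sketch that keeps only alphanumeric tokens would be:
token_words = [word for word in token_words if word.isalnum()]  # drop punctuation-only tokens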
Print out the first 100 tokens of your text from each language to check your work.
##r chunk
tokenization[[1]][1:100]
## [1] "function" "html"
## [3] "html.classname" "html.classname.replace"
## [5] "bno" "js"
## [7] "js" "document.documentelement"
## [9] "kobe" "bryant"
## [11] "death" "reminds"
## [13] "life" "fair"
## [15] "cdata" "var"
## [17] "gtm4wp_datalayer_name" "datalayer"
## [19] "var" "datalayer"
## [21] "datalayer" "context"
## [23] "https" "schema.org"
## [25] "graph" "type"
## [27] "website" "id"
## [29] "https" "harvest.org"
## [31] "website" "url"
## [33] "https" "harvest.org"
## [35] "harvest" "potentialaction"
## [37] "type" "searchaction"
## [39] "target" "https"
## [41] "harvest.org" "search_term_string"
## [43] "query" "input"
## [45] "required" "search_term_string"
## [47] "type" "imageobject"
## [49] "id" "https"
## [51] "harvest.org" "resources"
## [53] "gregs" "blog"
## [55] "post" "kobe"
## [57] "bryants" "death"
## [59] "reminds" "life"
## [61] "fair" "primaryimage"
## [63] "url" "https"
## [65] "harvest.org" "harvestsite"
## [67] "2020" "01"
## [69] "58cc03bb" "gettyimages"
## [71] "1196594255" "jpg"
## [73] "width" "1024"
## [75] "height" "683"
## [77] "type" "webpage"
## [79] "id" "https"
## [81] "harvest.org" "resources"
## [83] "gregs" "blog"
## [85] "post" "kobe"
## [87] "bryants" "death"
## [89] "reminds" "life"
## [91] "fair" "webpage"
## [93] "url" "https"
## [95] "harvest.org" "resources"
## [97] "gregs" "blog"
## [99] "post" "kobe"
##python chunk
print(token_words[:100])  # first 100 Python tokens
Note: here you can print out, summarize, or otherwise view your text in any way you want.