Reading in Data & Performing Data Cleaning

Blog post 3, describing how to read in data and perform data cleaning with R libraries, as part of the course “Text as Data”.

Rahul Gundeti (Graduate student, Data Analytics & Computational Social Sciences (DACSS), UMass Amherst)
2022-05-03

Loading the required libraries
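
The library calls themselves are not shown in the rendered output, so the chunk below is only a sketch inferred from the functions used later in this post:

# Packages implied by the function calls in this post (a sketch, not the original chunk):
#   quanteda           - corpus(), tokens(), dfm(), stopwords(), topfeatures()
#   quanteda.textplots - textplot_wordcloud()
#   tm                 - stripWhitespace()
# RColorBrewer is only used via RColorBrewer::brewer.pal(), so it needs to be
# installed but not attached.
library(tm)
library(quanteda)            # attached after tm so that quanteda's stopwords() is used
library(quanteda.textplots)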

Reading in the text file containing 1,607 stories and assigning it to the object “path”

path <- readLines("C:/Users/gunde/Documents/hony.txt")
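
A quick sanity check, not part of the original chunk, is to confirm how many lines were read:

# number of lines returned by readLines(); this should match the
# 1,607 documents that show up in the corpus below
length(path)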

Converting the text to a character vector and lower case

# store the lines as a character vector
my_char_vec <- c(path)
# convert the text to lower case and assign the dataset as hony (Humans of New York)
hony <- tolower(my_char_vec)
# inspect the first element
hony[1]
[1] "hony stories dataset"

Creating a corpus with the corpus() function

hony_corpus <- corpus(hony)
hony_corpus
Corpus consisting of 1,607 documents.
text1 :
"hony stories dataset"

text2 :
"in the early days kristen and i would write every single ema..."

text3 :
"both of us quit our jobs. it was nerve-wracking. i remember ..."

text4 :
"a few months after eduardos case i went to a music festival...."

text5 :
"eduardo was so nervous when he came into our office. he bare..."

text6 :
"right before tripp went to prison i sat the kids down one-by..."

[ reached max_ndoc ... 1,601 more documents ]

Viewing the head and tail of the hony corpus

head(hony_corpus)
Corpus consisting of 6 documents.
text1 :
"hony stories dataset"

text2 :
"in the early days kristen and i would write every single ema..."

text3 :
"both of us quit our jobs. it was nerve-wracking. i remember ..."

text4 :
"a few months after eduardos case i went to a music festival...."

text5 :
"eduardo was so nervous when he came into our office. he bare..."

text6 :
"right before tripp went to prison i sat the kids down one-by..."
tail(hony_corpus)
Corpus consisting of 6 documents.
text1602 :
"i used to be a really happy person. i really was. i was the ..."

text1603 :
"i think i have post traumatic stress. i have so many horribl..."

text1604 :
"there was a tumor in his brain. the doctor told us that he k..."

text1605 :
"one of maxs eyes started crossing over when he turned six ye..."

text1606 :
"max had two mothers. we found a sperm donor and i gave birth..."

text1607 :
"my childhood was building things: model rockets, model cars,..."

Tokenizing the hony corpus

hony_tokens <- tokens(hony_corpus)
hony_tokens
Tokens consisting of 1,607 documents.
text1 :
[1] "hony"    "stories" "dataset"

text2 :
 [1] "in"      "the"     "early"   "days"    "kristen" "and"    
 [7] "i"       "would"   "write"   "every"   "single"  "email"  
[ ... and 444 more ]

text3 :
 [1] "both"           "of"             "us"            
 [4] "quit"           "our"            "jobs"          
 [7] "."              "it"             "was"           
[10] "nerve-wracking" "."              "i"             
[ ... and 460 more ]

text4 :
 [1] "a"        "few"      "months"   "after"    "eduardos" "case"    
 [7] "i"        "went"     "to"       "a"        "music"    "festival"
[ ... and 440 more ]

text5 :
 [1] "eduardo" "was"     "so"      "nervous" "when"    "he"     
 [7] "came"    "into"    "our"     "office"  "."       "he"     
[ ... and 441 more ]

text6 :
 [1] "right"      "before"     "tripp"      "went"       "to"        
 [6] "prison"     "i"          "sat"        "the"        "kids"      
[11] "down"       "one-by-one"
[ ... and 452 more ]

[ reached max_ndoc ... 1,601 more documents ]

Creating a document-feature matrix from the hony tokens

hony_corpus_dfm <- hony_tokens %>%
  tokens_wordstem() %>%
  dfm()
hony_corpus_dfm
Document-feature matrix of: 1,607 documents, 9,521 features (98.66% sparse) and 0 docvars.
       features
docs    honi stori dataset in the earli day kristen and  i
  text1    1     1       1  0   0     0   0       0   0  0
  text2    0     0       0  5  15     1   1       3  14  9
  text3    0     0       0  5  12     0   1       2  10  7
  text4    0     1       0  3  15     0   0       5  10 10
  text5    0     1       0  8  17     0   1       0  13  9
  text6    0     0       0  4   9     0   0       0   8 14
[ reached max_ndoc ... 1,601 more documents, reached max_nfeat ... 9,511 more features ]

Viewing the top 100 features

topfeatures(hony_corpus_dfm,100)
     .      i    the      ,     to    and      a    was     my     it 
 36707  15862  12524  11635  11090   9349   9142   7395   6166   6001 
    of    but     me   that     in     he     we    for     so    she 
  5170   4715   4656   4590   4524   3731   3156   3122   3086   2801 
    on     be    her   with   were     im    had    you     at   when 
  2429   2155   2049   1954   1951   1949   1937   1889   1841   1817 
     :   have   like   they    all   just  there   time     is    one 
  1794   1786   1691   1689   1621   1551   1547   1514   1431   1415 
   him     go     up   want   this    his    get  about    our    out 
  1415   1394   1338   1304   1285   1271   1248   1244   1232   1179 
    id   year becaus   been     do  peopl   from  never  thing    not 
  1178   1153   1149   1140   1122   1116   1113   1089   1086   1086 
 alway    day   them    got     if   even   then  didnt   work     an 
  1065   1059   1044    979    975    965    956    954    949    946 
  what     as     us  could  would   back   life  start    now   told 
   922    869    861    849    832    800    788    783    778    776 
   ive     or  think  first    can  everi   love    tri   dont   know 
   768    758    749    748    746    741    735    730    726    722 
   are   feel      ?   much    say   onli   more   your   make   said 
   717    710    708    663    662    661    649    646    645    643 

Word cloud before pre-processing

set.seed(100)
library("quanteda.textplots")
textplot_wordcloud(hony_corpus_dfm, min_count = 10, random_order = FALSE, rotation = 0.15,
    color = RColorBrewer::brewer.pal(8, "Dark2"))

Extracting the corpus summary

hony_summary <- summary(hony_corpus)
hony_summary
Corpus consisting of 1607 documents, showing 100 documents:

    Text Types Tokens Sentences
   text1     3      3         1
   text2   219    456         2
   text3   233    472         4
   text4   236    452         2
   text5   219    453         1
   text6   227    464         6
   text7   215    483         1
   text8   228    471         2
   text9   233    492         2
  text10   226    470         1
  text11   217    494         2
  text12   240    486         3
  text13   210    483         5
  text14   219    474         2
  text15   370    946         1
  text16   229    480         1
  text17   218    474         1
  text18     8      9         1
  text19     9     10         1
  text20    18     18         1
  text21    22     25         1
  text22    59     84         1
  text23    67     95         3
  text24   236    563         4
  text25    27     30         1
  text26   221    479         1
  text27   207    500         8
  text28   223    474         1
  text29   228    472         2
  text30   230    454         1
  text31   222    460         2
  text32   201    492         1
  text33   223    467         1
  text34   214    481         1
  text35   225    497         1
  text36   231    464         1
  text37   219    480         2
  text38   221    447         2
  text39   226    490         1
  text40   217    504         2
  text41   209    478         4
  text42   217    485         2
  text43   231    482         4
  text44   225    512         1
  text45   217    498         2
  text46   222    477         1
  text47   219    475         1
  text48   237    479         2
  text49   215    477         1
  text50   232    456         4
  text51   312    796         1
  text52   219    571         1
  text53    21     22         1
  text54   189    525         7
  text55   206    499         4
  text56   222    499         1
  text57   207    503         8
  text58   183    504        24
  text59   185    508         1
  text60   203    498         4
  text61   202    525         5
  text62   215    500         1
  text63   212    513         5
  text64   220    502         3
  text65   235    499         1
  text66   219    487         5
  text67   222    499         2
  text68   222    509         2
  text69   217    510         3
  text70   230    481         1
  text71   237    543         4
  text72   230    463         2
  text73   233    469         2
  text74   276    583         3
  text75   242    498         1
  text76   195    506         3
  text77   193    518         1
  text78   239    481         2
  text79   143    263         1
  text80   233    549         1
  text81   211    496         3
  text82   199    497         1
  text83   193    495         1
  text84   234    480         1
  text85   201    497         1
  text86   220    455         1
  text87   234    488         7
  text88   226    432         1
  text89   206    457         1
  text90     7      7         1
  text91   208    479         4
  text92   220    490         1
  text93   220    468         4
  text94   223    466         3
  text95   205    487         4
  text96   212    470         7
  text97   212    459         1
  text98   208    462         5
  text99   214    466         3
 text100   237    485         2

Pre-processing the hony data

docvars(hony_corpus)
data frame with 0 columns and 1607 rows
# remove extra whitespace with stripWhitespace() (from the tm package)
hony0 <- stripWhitespace(hony_corpus)
# convert the cleaned text into tokens
honytokens <- tokens(hony0)
print(honytokens)
Tokens consisting of 1,607 documents.
text1 :
[1] "hony"    "stories" "dataset"

text2 :
 [1] "in"      "the"     "early"   "days"    "kristen" "and"    
 [7] "i"       "would"   "write"   "every"   "single"  "email"  
[ ... and 444 more ]

text3 :
 [1] "both"           "of"             "us"            
 [4] "quit"           "our"            "jobs"          
 [7] "."              "it"             "was"           
[10] "nerve-wracking" "."              "i"             
[ ... and 460 more ]

text4 :
 [1] "a"        "few"      "months"   "after"    "eduardos" "case"    
 [7] "i"        "went"     "to"       "a"        "music"    "festival"
[ ... and 440 more ]

text5 :
 [1] "eduardo" "was"     "so"      "nervous" "when"    "he"     
 [7] "came"    "into"    "our"     "office"  "."       "he"     
[ ... and 441 more ]

text6 :
 [1] "right"      "before"     "tripp"      "went"       "to"        
 [6] "prison"     "i"          "sat"        "the"        "kids"      
[11] "down"       "one-by-one"
[ ... and 452 more ]

[ reached max_ndoc ... 1,601 more documents ]
#Removing punctuation, numbers, symbols, and separators
honytokens <- tokens(hony_corpus, remove_punct = T, remove_numbers = T, remove_separators = T, remove_symbols = T )
print(honytokens)
Tokens consisting of 1,607 documents.
text1 :
[1] "hony"    "stories" "dataset"

text2 :
 [1] "in"      "the"     "early"   "days"    "kristen" "and"    
 [7] "i"       "would"   "write"   "every"   "single"  "email"  
[ ... and 385 more ]

text3 :
 [1] "both"           "of"             "us"            
 [4] "quit"           "our"            "jobs"          
 [7] "it"             "was"            "nerve-wracking"
[10] "i"              "remember"       "on"            
[ ... and 396 more ]

text4 :
 [1] "a"        "few"      "months"   "after"    "eduardos" "case"    
 [7] "i"        "went"     "to"       "a"        "music"    "festival"
[ ... and 377 more ]

text5 :
 [1] "eduardo" "was"     "so"      "nervous" "when"    "he"     
 [7] "came"    "into"    "our"     "office"  "he"      "barely" 
[ ... and 380 more ]

text6 :
 [1] "right"      "before"     "tripp"      "went"       "to"        
 [6] "prison"     "i"          "sat"        "the"        "kids"      
[11] "down"       "one-by-one"
[ ... and 386 more ]

[ reached max_ndoc ... 1,601 more documents ]

Inspecting the English stopword list

length(stopwords("en"))
[1] 175
stopwords("en")
  [1] "i"          "me"         "my"         "myself"     "we"        
  [6] "our"        "ours"       "ourselves"  "you"        "your"      
 [11] "yours"      "yourself"   "yourselves" "he"         "him"       
 [16] "his"        "himself"    "she"        "her"        "hers"      
 [21] "herself"    "it"         "its"        "itself"     "they"      
 [26] "them"       "their"      "theirs"     "themselves" "what"      
 [31] "which"      "who"        "whom"       "this"       "that"      
 [36] "these"      "those"      "am"         "is"         "are"       
 [41] "was"        "were"       "be"         "been"       "being"     
 [46] "have"       "has"        "had"        "having"     "do"        
 [51] "does"       "did"        "doing"      "would"      "should"    
 [56] "could"      "ought"      "i'm"        "you're"     "he's"      
 [61] "she's"      "it's"       "we're"      "they're"    "i've"      
 [66] "you've"     "we've"      "they've"    "i'd"        "you'd"     
 [71] "he'd"       "she'd"      "we'd"       "they'd"     "i'll"      
 [76] "you'll"     "he'll"      "she'll"     "we'll"      "they'll"   
 [81] "isn't"      "aren't"     "wasn't"     "weren't"    "hasn't"    
 [86] "haven't"    "hadn't"     "doesn't"    "don't"      "didn't"    
 [91] "won't"      "wouldn't"   "shan't"     "shouldn't"  "can't"     
 [96] "cannot"     "couldn't"   "mustn't"    "let's"      "that's"    
[101] "who's"      "what's"     "here's"     "there's"    "when's"    
[106] "where's"    "why's"      "how's"      "a"          "an"        
[111] "the"        "and"        "but"        "if"         "or"        
[116] "because"    "as"         "until"      "while"      "of"        
[121] "at"         "by"         "for"        "with"       "about"     
[126] "against"    "between"    "into"       "through"    "during"    
[131] "before"     "after"      "above"      "below"      "to"        
[136] "from"       "up"         "down"       "in"         "out"       
[141] "on"         "off"        "over"       "under"      "again"     
[146] "further"    "then"       "once"       "here"       "there"     
[151] "when"       "where"      "why"        "how"        "all"       
[156] "any"        "both"       "each"       "few"        "more"      
[161] "most"       "other"      "some"       "such"       "no"        
[166] "nor"        "not"        "only"       "own"        "same"      
[171] "so"         "than"       "too"        "very"       "will"      

Removing stopwords

# remove stopwords from our tokens object
hony_clean <- tokens_select(honytokens, pattern = stopwords("en"),
                                           selection = "remove")
length(hony_clean)
[1] 1607
hony_clean[5]
Tokens consisting of 1 document.
text5 :
 [1] "eduardo"     "nervous"     "came"        "office"     
 [5] "barely"      "spoke"       "english"     "told"       
 [9] "story"       "interpreter" "explained"   "hometown"   
[ ... and 186 more ]
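
As an aside, quanteda's tokens_remove() is a convenience wrapper for tokens_select(..., selection = "remove"), so the same result could be obtained with a one-liner (the object name hony_clean_alt is just for illustration):

# equivalent shortcut to tokens_select(..., selection = "remove")
hony_clean_alt <- tokens_remove(honytokens, pattern = stopwords("en"))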

Creating a DFM after pre-processing

hony_clean_dfm <- hony_clean %>%
  tokens_wordstem() %>%
  dfm()
hony_clean_dfm
Document-feature matrix of: 1,607 documents, 9,164 features (99.06% sparse) and 0 docvars.
       features
docs    honi stori dataset earli day kristen write everi singl email
  text1    1     1       1     0   0       0     0     0     0     0
  text2    0     0       0     1   1       3     1     1     1     1
  text3    0     0       0     0   1       2     0     0     0     0
  text4    0     1       0     0   0       5     0     1     0     1
  text5    0     1       0     0   1       0     0     0     0     0
  text6    0     0       0     0   0       0     0     0     0     0
[ reached max_ndoc ... 1,601 more documents, reached max_nfeat ... 9,154 more features ]
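
The cleaned DFM is still about 99% sparse. An optional further step, not applied in this post, would be to trim rare features with dfm_trim(); the sketch below assumes an arbitrary cutoff of five documents:

# optional: keep only features that appear in at least 5 documents
# (the min_docfreq = 5 threshold is purely illustrative)
hony_trimmed_dfm <- dfm_trim(hony_clean_dfm, min_docfreq = 5)
hony_trimmed_dfm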

Viewing the top 50 words after cleaning

topfeatures(hony_clean_dfm,50)
    im   like   just   time    one     go   want    get     id   year 
  1949   1691   1551   1514   1415   1394   1304   1248   1178   1153 
 peopl  never  thing  alway    day    got   even  didnt   work     us 
  1116   1089   1086   1065   1059    979    965    954    949    861 
  back   life  start    now   told    ive  think  first    can  everi 
   800    788    783    778    776    768    749    748    746    741 
  love    tri   dont   know   feel   much    say   make   said   tell 
   735    730    726    722    710    663    662    645    643    607 
  come   call   live school    lot   home    ask    hed   that   look 
   597    581    572    569    566    564    560    544    543    533 

Word cloud after pre-processing

set.seed(100)
library("quanteda.textplots")
textplot_wordcloud(hony_clean_dfm, min_count = 10, random_order = FALSE, rotation = 0.15,
    color = RColorBrewer::brewer.pal(8, "Dark2"))