< 먼저 파일을 가져오고 정리하기.>
# Load the corpus from an interactively chosen file, one element per line
# of text, and keep an untouched copy of the raw text in `brown.ori`.
brown <- scan(
  file = file.choose(),
  what = "char",
  sep = "\n",
  quote = "",
  comment.char = ""
)
brown.ori <- brown
head(brown)
## [1] "The Fulton County Grand Jury said Friday an investigation of Atlanta's recent primary election produced no evidence that any irregularities took place."
## [2] "The jury further said in term-end presentments that the City Executive Committee, which had over-all charge of the election, deserves the praise and thanks of the City of Atlanta for the manner in which the election was conducted."
## [3] "The September-*october term jury had been charged by Fulton Superior Court Judge Durwood Pye to investigate reports of possible irregularities in the hard-fought primary which was won by Mayor-nominate Ivan Allen Jr.."
## [4] "Only a relative handful of such reports was received, the jury said, considering the widespread interest in the election, the number of voters and the size of this city."
## [5] "The jury said it did find that many of Georgia's registration and election laws are outmoded or inadequate and often ambiguous."
## [6] "It recommended that Fulton legislators act to have these laws studied and revised to the end of modernizing and improving them."
1. How many sentences are in the corpus?
# Clean up: lower-case everything, turn punctuation into spaces, drop digits,
# collapse runs of spaces into one, and trim leading/trailing spaces.
brown <- tolower(brown)
# Punctuation becomes a space so that words joined by it stay separated.
brown <- gsub("[[:punct:]]", " ", brown)
# Digits are deleted outright (no replacement character).
brown <- gsub("[0-9]", "", brown)
# Collapse any run of spaces left by the two steps above into a single space.
brown <- gsub(" +", " ", brown)
# Strip spaces at the start and at the end of every line.
brown <- gsub("^ +", "", brown)
brown <- gsub(" +$", "", brown)
head(brown)
## [1] "the fulton county grand jury said friday an investigation of atlanta s recent primary election produced no evidence that any irregularities took place"
## [2] "the jury further said in term end presentments that the city executive committee which had over all charge of the election deserves the praise and thanks of the city of atlanta for the manner in which the election was conducted"
## [3] "the september october term jury had been charged by fulton superior court judge durwood pye to investigate reports of possible irregularities in the hard fought primary which was won by mayor nominate ivan allen jr"
## [4] "only a relative handful of such reports was received the jury said considering the widespread interest in the election the number of voters and the size of this city"
## [5] "the jury said it did find that many of georgia s registration and election laws are outmoded or inadequate and often ambiguous"
## [6] "it recommended that fulton legislators act to have these laws studied and revised to the end of modernizing and improving them"
# Number of sentences (number of elements in `brown`).
length(brown)
## [1] 51763
# This question seems to ask for the total number of words in the Brown corpus.
# Tokenization: split every cleaned line on single spaces.
words.list.brown <- strsplit(brown, split = " ")
# Flatten the per-line token lists into one character vector.
unlist.brown <- unlist(words.list.brown)
# Discard empty strings so that only real tokens are counted.
unlist.brown <- unlist.brown[nzchar(unlist.brown)]
# Number of words (tokens).
length(unlist.brown)
## [1] 1023243
#browncorpus에 나온 단어들의 종류를 물어보는 것 같음.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Frequency table of the tokens; its length is the number of distinct words.
wordcount <- table(unlist.brown)
length(wordcount)
## [1] 41144
# Positions (indices into `unlist.brown`) of tokens starting with a vowel.
vowel.start.words <- grep("^[aeiou]", unlist.brown)
head(vowel.start.words, n = 20)
## [1] 8 9 10 11 15 18 20 21 28 30 35 39 40 42 44 48 50 53 54 58
# Number of tokens starting with a vowel.
length(vowel.start.words)
## [1] 294097
# Positions of tokens ending in "ness".
# The result overwrites `vowel.start.words`, reusing the earlier variable name.
vowel.start.words <- grep("ness$", unlist.brown)
head(vowel.start.words, n = 20)
## [1] 2851 4187 5108 5198 6368 8914 9001 9756 10316 10456 11682 12272
## [13] 14193 15461 15528 16428 18450 23483 24131 25812
# Number of tokens ending in "ness".
length(vowel.start.words)
## [1] 1468
# Positions of tokens containing two consecutive "z" characters.
# The result overwrites `vowel.start.words`, reusing the earlier variable name.
vowel.start.words <- grep("(z){2}", unlist.brown)
head(vowel.start.words, n = 20)
## [1] 24212 26099 30015 30798 31935 34484 46518 46581 46591 50122
## [11] 50184 57413 60902 63814 64405 75792 81492 90758 111447 113479
# Number of tokens containing "zz".
length(vowel.start.words)
## [1] 351
# Sanity check: look at the tokens behind the first two matched positions.
unlist.brown[24212]
## [1] "grizzlies"
unlist.brown[26099]
## [1] "fizzled"
# Positions of tokens containing any two-vowel sequence, with every ordered
# pair of vowels spelled out explicitly in the alternation.
# The result overwrites `vowel.start.words`, reusing the earlier variable name.
vowel.start.words <- grep(
  "aa|ee|ii|oo|uu|ae|ea|ai|ia|ao|oa|au|ua|ei|ie|eo|oe|eu|ue|io|oi|iu|ui|ou|uo",
  unlist.brown
)
head(vowel.start.words, n = 20)
## [1] 3 6 9 15 21 22 27 36 44 47 61 70 74 75 77 84 88 107 110
## [20] 113
# Number of tokens containing a two-vowel sequence.
length(vowel.start.words)
## [1] 177613
# Sanity check. As Eunchae pointed out, a problem was found: words with three
# or more vowels also show up, as in the examples below.
unlist.brown[21]
## [1] "irregularities"
unlist.brown[3]
## [1] "county"
unlist.brown[107]
## [1] "received"
# Positions of tokens containing a run of two or more vowels.
# NOTE(review): the recorded positions and the count (177613) are the same as
# for the explicit vowel-pair alternation, so `{2,}` does not exclude tokens
# with longer vowel runs -- confirm this is what the exercise wants.
# The result overwrites `vowel.start.words`, reusing the earlier variable name.
vowel.start.words <- grep("(a|e|i|o|u){2,}", unlist.brown)
head(vowel.start.words, n = 20)
## [1] 3 6 9 15 21 22 27 36 44 47 61 70 74 75 77 84 88 107 110
## [20] 113
# Number of tokens containing such a vowel run.
length(vowel.start.words)
## [1] 177613
# Sanity check: look at two of the matched tokens.
unlist.brown[27]
## [1] "said"
unlist.brown[44]
## [1] "election"
# Positions of tokens containing two consecutive "x" characters.
x.words <- grep("x{2}", unlist.brown)
head(x.words, n = 20)
## [1] 77178
# Number of tokens containing "xx".
length(x.words)
## [1] 1
# Sanity check: look at the single matched token.
unlist.brown[77178]
## [1] "foxx"
# Tokens containing "mail" were found, but no e-mail addresses.
# The pattern used to be "*mail". A leading `*` has nothing to repeat: that is
# undefined in POSIX extended regular expressions and an error with
# perl = TRUE. The recorded match on "mails" shows the star contributed
# nothing, so the plain substring pattern "mail" expresses the same search in
# a well-defined way.
# The result is stored in `x.words`, reusing the earlier variable name.
x.words <- grep("mail", unlist.brown)
head(x.words, n = 20)
## [1] 20029 20055 20069 31725 31743 41290 43164 52867 63276 63366
## [11] 63557 88043 103086 103105 103125 103217 103266 143060 212855 222803
# Number of tokens containing "mail".
length(x.words)
## [1] 99
# Sanity check: look at the first matched token.
unlist.brown[20029]
## [1] "mails"
# This is not the desired result. The clean-up step replaced every punctuation
# character (including "@" and ".") with a space, so no token in
# `unlist.brown` can still be an e-mail address. Search the untouched copy of
# the raw text instead: `email.lines` holds the positions of the lines in
# `brown.ori` that contain something shaped like local-part@domain.tld.
email.lines <- grep(
  "[[:alnum:]._%+-]+@[[:alnum:].-]+\\.[[:alpha:]]{2,}",
  brown.ori
)
length(email.lines)