The data set is taken from Kaggle.
# Load the data set from a local CSV file; the first row of the file holds the
# column names.
data <- read.csv("C:/Users/ELCOT/Documents/wordprocess.csv", header = TRUE)

# Preview the first rows to confirm the file was parsed as expected.
head(data)

# The data frame is deliberately NOT attach()ed: every column is referenced
# through `data` (inside dplyr/tidytext verbs), and attaching would copy the
# columns onto the search path where they can silently mask, or be masked by,
# other objects of the same name.
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.1.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.1.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stopwords)
## Warning: package 'stopwords' was built under R version 4.1.3
library(igraph)
## Warning: package 'igraph' was built under R version 4.1.3
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
## Warning: package 'ggraph' was built under R version 4.1.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.1.3
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 4.1.3
# Show the first 50 rows of the data set.
head(data, n = 50)
The summary of the data set reports the length, class, and mode of each character column, and the minimum, quartiles, mean, and maximum of each numeric score column.
# Per-column summary of the data set.
summary(object = data)
## gender race.ethnicity parental.level.of.education
## Length:1000 Length:1000 Length:1000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## lunch test.preparation.course math.score reading.score
## Length:1000 Length:1000 Min. : 0.00 Min. : 17.00
## Class :character Class :character 1st Qu.: 57.00 1st Qu.: 59.00
## Mode :character Mode :character Median : 66.00 Median : 70.00
## Mean : 66.09 Mean : 69.17
## 3rd Qu.: 77.00 3rd Qu.: 79.00
## Max. :100.00 Max. :100.00
## writing.score
## Min. : 10.00
## 1st Qu.: 57.75
## Median : 69.00
## Mean : 68.05
## 3rd Qu.: 79.00
## Max. :100.00
# Compact display of the data frame's structure: dimensions, column types and
# the first few values of each column.
str(object = data)
## 'data.frame': 1000 obs. of 8 variables:
## $ gender : chr "female" "female" "female" "male" ...
## $ race.ethnicity : chr "group B" "group C" "group B" "group A" ...
## $ parental.level.of.education: chr "bachelor's degree" "some college" "master's degree" "associate's degree" ...
## $ lunch : chr "standard" "standard" "standard" "free/reduced" ...
## $ test.preparation.course : chr "none" "completed" "none" "none" ...
## $ math.score : int 72 69 90 47 76 71 88 40 64 38 ...
## $ reading.score : int 72 90 95 57 78 83 95 43 64 60 ...
## $ writing.score : int 74 88 93 44 75 78 92 39 67 50 ...
Tokenization is the process of dividing text into a set of meaningful pieces, called tokens. Here, each parental level of education is split into single-word tokens.
# Split the parental education level of every row into one token per row,
# stored in a new `word` column.
data_tokens <- data %>%
  unnest_tokens(output = word, input = parental.level.of.education)
head(data_tokens, n = 100)

# Keep only the tokens that do not appear in tidytext's `stop_words` lexicon.
tokens_stop <- data_tokens %>%
  filter(!(word %in% stop_words$word))
head(tokens_stop, n = 100)
# Split every parental education level into bigrams (pairs of adjacent words),
# drop each pair in which either word is a stop word, and tally how often each
# remaining pair occurs, most frequent first.
ng <- data %>%
  unnest_tokens(
    word,
    parental.level.of.education,
    token = "ngrams",
    n = 2
  ) %>%
  separate(word, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  unite(word, word1, word2, sep = " ") %>%
  count(word, sort = TRUE)
head(ng, 100)

# Split the bigrams back into their two words so that each row describes one
# edge (word1 -> word2) together with its frequency `n`.
s <- ng %>%
  separate(word, c("word1", "word2"), sep = " ")

# Keep only the bigrams that occur at least twice.
cn <- s %>%
  filter(n >= 2)
head(cn, 100)

# Build the directed word graph. The first two columns (word1, word2) define
# the edges and the remaining column `n` becomes an edge attribute.
# `directed = TRUE` belongs to graph_from_data_frame(); passing it to count()
# instead re-tallies the rows (resetting every `n` to 1) and adds a constant
# `directed` column that ends up as a meaningless edge attribute, so the real
# bigram frequencies would never reach the plot.
w_n <- cn %>%
  graph_from_data_frame(directed = TRUE)
w_n
## IGRAPH 0817723 DN-- 4 3 --
## + attr: name (v/c), directed (e/l), n (e/n)
## + edges from 0817723 (vertex names):
## [1] associate's->degree bachelor's ->degree master's ->degree
# Fix the random seed before the layout is computed so the plot is
# reproducible between runs.
set.seed(20181005)

# Open arrowhead drawn at the end ("last") of every edge.
a <- arrow(
  angle = 20,
  length = unit(0.1, "inches"),
  ends = "last",
  type = "open"
)

# Draw the word graph with the "fr" layout: edges are coloured and sized by
# their `n` attribute, and every node is labelled with its word.
ggraph(w_n, layout = "fr") +
  geom_edge_link(aes(color = n, width = n), arrow = a) +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
# Frequency of every remaining (non-stop-word) token, most frequent first.
word_freq <- count(tokens_stop, word, sort = TRUE)
head(word_freq, n = 100)

# Render the frequencies as a word cloud. minRotation and maxRotation are both
# -pi/6 and rotateRatio is 1, so the rotation range collapses to one angle.
# NOTE(review): backgroundColor is given rainbow(2), i.e. two colours;
# presumably a single colour is expected here - confirm against the
# wordcloud2 documentation.
wordcloud2(
  data = word_freq,
  size = 0.4,
  color = "random-light",
  backgroundColor = rainbow(2),
  minRotation = -pi / 6,
  maxRotation = -pi / 6,
  rotateRatio = 1,
  shape = "circle"
)
From the word cloud, we can easily see that "education" and "degree" are the words that occur most frequently in the student records.
From the visualization and the preprocessing of the input data, we can conclude that the most common parental level of education is a degree. The preprocessing steps above support this conclusion.