import a data set

The data set is taken from Kaggle.

# Load the data set from the local CSV file; the first row holds the column
# names. TRUE is spelled out because T is an ordinary variable that can be
# reassigned, and file.path() around a single complete path adds nothing.
data <- read.csv("C:/Users/ELCOT/Documents/wordprocess.csv", header = TRUE)
# Preview the first rows to confirm the import worked.
head(data)

libraries

# NOTE(review): attach() puts the columns of `data` on the search path. The
# code visible below only refers to columns inside dplyr/tidyr/tidytext verbs
# that receive the data frame explicitly, so this call looks unnecessary --
# confirm nothing else relies on it before removing.
attach(data)
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.1.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.1.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stopwords)
## Warning: package 'stopwords' was built under R version 4.1.3
library(igraph)
## Warning: package 'igraph' was built under R version 4.1.3
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggraph)
## Warning: package 'ggraph' was built under R version 4.1.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.1.3
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 4.1.3
# Show the first 50 rows of the imported data.
head(x = data, n = 50)

summary

The summary of the data set reports the length, class, and mode of each character column, and the minimum, quartiles, median, mean, and maximum of each numeric column.

# Per-column summary of the data frame.
summary(object = data)
##     gender          race.ethnicity     parental.level.of.education
##  Length:1000        Length:1000        Length:1000                
##  Class :character   Class :character   Class :character           
##  Mode  :character   Mode  :character   Mode  :character           
##                                                                   
##                                                                   
##                                                                   
##     lunch           test.preparation.course   math.score     reading.score   
##  Length:1000        Length:1000             Min.   :  0.00   Min.   : 17.00  
##  Class :character   Class :character        1st Qu.: 57.00   1st Qu.: 59.00  
##  Mode  :character   Mode  :character        Median : 66.00   Median : 70.00  
##                                             Mean   : 66.09   Mean   : 69.17  
##                                             3rd Qu.: 77.00   3rd Qu.: 79.00  
##                                             Max.   :100.00   Max.   :100.00  
##  writing.score   
##  Min.   : 10.00  
##  1st Qu.: 57.75  
##  Median : 69.00  
##  Mean   : 68.05  
##  3rd Qu.: 79.00  
##  Max.   :100.00

structure of dataset

# Compact display of the column names, types, and first values.
str(object = data)
## 'data.frame':    1000 obs. of  8 variables:
##  $ gender                     : chr  "female" "female" "female" "male" ...
##  $ race.ethnicity             : chr  "group B" "group C" "group B" "group A" ...
##  $ parental.level.of.education: chr  "bachelor's degree" "some college" "master's degree" "associate's degree" ...
##  $ lunch                      : chr  "standard" "standard" "standard" "free/reduced" ...
##  $ test.preparation.course    : chr  "none" "completed" "none" "none" ...
##  $ math.score                 : int  72 69 90 47 76 71 88 40 64 38 ...
##  $ reading.score              : int  72 90 95 57 78 83 95 43 64 60 ...
##  $ writing.score              : int  74 88 93 44 75 78 92 39 67 50 ...

data tokenization

Tokenization is the process of dividing text into a set of meaningful pieces.

# Split the parental education column into individual word tokens, one per
# row, stored in a new column called `word`.
data_tokens <- data %>%
  unnest_tokens(output = word, input = parental.level.of.education)
head(data_tokens, n = 100)

filter out stop words

# Keep only the tokens that do not appear in the stop_words lexicon.
tokens_stop <- filter(data_tokens, !(word %in% stop_words$word))
head(tokens_stop, n = 100)

n grams

# Build two-word n-grams from the parental education column, drop every
# bigram in which either word is a stop word, and count how often each
# remaining bigram occurs (most frequent first).
ng <- data %>%
  unnest_tokens(word, parental.level.of.education, token = "ngrams", n = 2) %>%
  separate(word, into = c("word1", "word2"), sep = " ") %>%
  filter(
    !(word1 %in% stop_words$word),
    !(word2 %in% stop_words$word)
  ) %>%
  unite(word, word1, word2, sep = " ") %>%
  count(word, sort = TRUE)
head(ng, n = 100)

separate process

# Split each counted bigram back into its two words, then keep only the
# pairs that occur at least twice.
s <- separate(ng, col = word, into = c("word1", "word2"), sep = " ")
cn <- filter(s, n >= 2)
head(cn, n = 100)

count

# Build a directed graph whose edges run word1 -> word2.
# `cn` already holds one row per bigram with its frequency in `n`, so the
# frequencies are carried through with `wt = n`; a plain count() would only
# tally rows and reset every edge weight to 1. `directed` is an argument of
# graph_from_data_frame(), not of count(), where it would be taken as an
# extra grouping column and end up as a spurious edge attribute.
w_n <- cn %>%
  count(word1, word2, wt = n) %>%
  graph_from_data_frame(directed = TRUE)
w_n
## IGRAPH 0817723 DN-- 4 3 -- 
## + attr: name (v/c), n (e/n)
## + edges from 0817723 (vertex names):
## [1] associate's->degree bachelor's ->degree master's   ->degree

ggplot

# Seed the random number generator before plotting so the figure can be
# reproduced.
set.seed(20181005)
# Open arrow head drawn at the end of each edge.
a <- arrow(
  angle = 20,
  length = unit(0.1, "inches"),
  ends = "last",
  type = "open"
)

# Draw the word network: edge colour and width follow `n`, and every node is
# labelled with its name.
ggraph(w_n, layout = "fr") +
  geom_edge_link(aes(color = n, width = n), arrow = a) +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

frequency of words

# Tally how often each remaining token occurs, most frequent first.
word_freq <- count(tokens_stop, word, sort = TRUE)
head(word_freq, n = 100)

cloud

# Render the word frequencies as a circular word cloud. The minimum and
# maximum rotation are both -pi/6 and rotateRatio is 1.
# NOTE(review): backgroundColor receives the two-colour vector rainbow(2);
# confirm that wordcloud2 handles more than one colour here.
wordcloud2(
  data = word_freq,
  size = 0.4,
  color = "random-light",
  backgroundColor = rainbow(2),
  minRotation = -pi / 6,
  maxRotation = -pi / 6,
  rotateRatio = 1,
  shape = "circle"
)

insights

From the word cloud, we can easily see that "education" and "degree" are the words that appear most often in the students' records.

INFERENCES:

  1. The analysis above examines the reviews about the parental level of education.
  2. “DEGREE” is the word that appears most often in the reviews.

The visualization and pre-processing of the input reviews indicate that
the most common parental level of education is a degree, and the pre-processing steps above support this.