Project Team: DeepfriedData
Bonnie Cooper
David Moste
Abdellah Ait
Gehad Gad
#Import libraries
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## Warning: package 'tidyr' was built under R version 3.6.2
## Warning: package 'readr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.2
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts --------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(fmsb)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.6.3
## Loading required package: RColorBrewer
library(tm)
## Warning: package 'tm' was built under R version 3.6.3
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 3.6.2
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
# Load the Indeed data from Bonnie Cooper's GitHub repository.
Data1indeed <- read.csv(
  file = "https://github.com/SmilodonCub/DATA607/raw/master/indeed1000_NLP.csv"
)
# Preview the first rows of the imported table.
head(Data1indeed)
## X doc_id paragraph_id sentence_id sentence token_id token
## 1 1 doc1 1 1 data 1 data
## 2 2 doc2 1 1 experience 1 experience
## 3 3 doc3 1 1 work 1 work
## 4 4 doc4 1 1 browse 1 browse
## 5 5 doc5 1 1 science 1 science
## 6 6 doc6 1 1 center 1 center
## lemma upos xpos feats head_token_id dep_rel deps
## 1 data NOUN NN Number=Sing 0 root NA
## 2 experience NOUN NN Number=Sing 0 root NA
## 3 work NOUN NN Number=Sing 0 root NA
## 4 browse NOUN NN Number=Sing 0 root NA
## 5 science NOUN NN Number=Sing 0 root NA
## 6 center NOUN NN Number=Sing 0 root NA
## misc value
## 1 SpacesAfter=\\n 813
## 2 SpacesAfter=\\n 481
## 3 SpacesAfter=\\n 335
## 4 SpacesAfter=\\n 299
## 5 SpacesAfter=\\n 222
## 6 SpacesAfter=\\n 207
# Keep only the word (`sentence`) and its numeric `value`; every other column
# is dropped. Columns are selected by name rather than by position (5 and 16)
# so the subset does not silently change if the column order of the CSV does.
# NOTE(review): `value` is presumably the word's occurrence count -- confirm.
Data2indeed <- Data1indeed[, c("sentence", "value")]
# Preview the first rows of the reduced table.
head(Data2indeed)
## sentence value
## 1 data 813
## 2 experience 481
## 3 work 335
## 4 browse 299
## 5 science 222
## 6 center 207
# Describe the Indeed table: per-column summary, then the variance and the
# standard deviation of the `value` column. The recorded output shows a
# strongly right-skewed column (median 9, mean 20.85, max 813).
summary(Data2indeed)
## sentence value
## ability : 1 Min. : 4.00
## able : 1 1st Qu.: 6.00
## academic: 1 Median : 9.00
## access : 1 Mean : 20.85
## accuracy: 1 3rd Qu.: 18.00
## accurate: 1 Max. :813.00
## (Other) :994
var(Data2indeed$value)
## [1] 1866.264
sd(Data2indeed$value)
## [1] 43.20028
# Load the Reddit data from the SmilodonCub/DATA607 GitHub repository.
Data1reddit <- read.csv(
  file = "https://github.com/SmilodonCub/DATA607/raw/master/reddit1000_NLP.csv"
)
# Preview the first rows of the imported table.
head(Data1reddit)
## X doc_id paragraph_id sentence_id sentence token_id token lemma upos
## 1 1 doc1 1 1 data 1 data data NOUN
## 2 2 doc2 1 1 work 1 work work NOUN
## 3 3 doc3 1 1 job 1 job job NOUN
## 4 4 doc4 1 1 people 1 people people NOUN
## 5 5 doc5 1 1 science 1 science science NOUN
## 6 6 doc6 1 1 know 1 know know VERB
## xpos feats head_token_id dep_rel deps misc value
## 1 NN Number=Sing 0 root NA SpacesAfter=\\n 2697
## 2 NN Number=Sing 0 root NA SpacesAfter=\\n 1533
## 3 NN Number=Sing 0 root NA SpacesAfter=\\n 1294
## 4 NNS Number=Plur 0 root NA SpacesAfter=\\n 1236
## 5 NN Number=Sing 0 root NA SpacesAfter=\\n 1232
## 6 VB VerbForm=Inf 0 root NA SpacesAfter=\\n 1212
# Keep only the word and its count from the Reddit table, renamed to `words`
# and `Count` so they stay distinguishable from the Indeed columns.
# NOTE(review): the statements that create and print Data2reddit are missing
# from this rendered output; they are reconstructed here from the printed
# result below and from the equivalent Indeed step -- confirm against the
# original .Rmd.
Data2reddit <- Data1reddit[, c("sentence", "value")]
names(Data2reddit) <- c("words", "Count")
#Display the data
head(Data2reddit)
## words Count
## 1 data 2697
## 2 work 1533
## 3 job 1294
## 4 people 1236
## 5 science 1232
## 6 know 1212
# Describe the Reddit table: per-column summary, then the variance and the
# standard deviation of the `Count` column. As the recorded output shows, the
# counts are strongly right-skewed (median 59, mean 113.2, max 2697).
summary(Data2reddit)
## words Count
## ability : 1 Min. : 27.0
## absolutely: 1 1st Qu.: 37.0
## abstract : 1 Median : 59.0
## academia : 1 Mean : 113.2
## academic : 1 3rd Qu.: 117.2
## access : 1 Max. :2697.0
## (Other) :994
var(Data2reddit$Count)
## [1] 29785.06
sd(Data2reddit$Count)
## [1] 172.5835
#combine the two data in order to get a correlation.
# NOTE(review): cbind() places the two tables side by side row by row; it does
# not match rows on the word. The preview shows the consequence: row 2 pairs
# "experience" (Indeed) with "work" (Reddit).
Data3 <- cbind(Data2indeed, Data2reddit)
#Display the data
head(Data3)
## sentence value words Count
## 1 data 813 data 2697
## 2 experience 481 work 1533
## 3 work 335 job 1294
## 4 browse 299 people 1236
## 5 science 222 science 1232
## 6 center 207 know 1212
# Correlate how often each word occurs on Indeed with how often the same word
# occurs on Reddit.
#
# Why not cor(as.numeric(sentence), as.numeric(words))? The word columns are
# categorical: as.numeric() on a factor returns its internal level codes (the
# alphabetical position of each word), not a measurement, so the previously
# reported r = -0.039 described those codes rather than word usage. In
# addition, a column-bound table pairs rows by position, so one row can hold
# two different words. Matching rows on the word itself and correlating the
# two count columns answers the intended question.
word_counts <- merge(
  Data2indeed, Data2reddit,
  by.x = "sentence", by.y = "words"
)
# merge() keeps only words present in both tables; a correlation needs at
# least a few shared words to be meaningful.
stopifnot(nrow(word_counts) > 2)
# Pearson correlation between the Indeed count (value) and the Reddit count
# (Count) of the shared words. Re-knit the document to refresh the printed
# value.
cor(word_counts$value, word_counts$Count)
# Correlation shows how strongly the variables are related. The correlation
# ranges from -1.0 to +1.0. The closer the correlation (r) is to +1 or -1, the
# more closely the two variables are related.
ggscatter(
  word_counts,
  x = "value", y = "Count",
  add = "reg.line", cor.coef = TRUE, conf.int = TRUE
)
## `geom_smooth()` using formula 'y ~ x'