# Text Analysis of iWeb text data ## The sample contains every 10th entry of a 60,000-word frequency list.
library(dplyr)
library(stringr)
library(tidytext)
library(tidyverse)
library(lubridate)
library(scales)
library(corrplot)
library(Hmisc)
library(reshape2)
# Load the tab-separated iWeb lemma sample (first row holds the column names)
# and show a compact overview of its columns.
text_Df <- read.delim(
  file = "C:/Users/HP/Desktop/iweb_wordFreq_sample/iWeb_lemma.txt",
  header = TRUE
)
text_Df %>% glimpse()
## Observations: 6,045
## Variables: 8
## $ rank <int> 1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101, 111, 121, 131...
## $ wordID <int> 1, 10, 20, 30, 37, 50, 59, 71, 75, 90, 100, 111, 123, 130...
## $ word <fct> the, you, will, as, use, year, there, into, may, no, good...
## $ PoS <fct> a, p, v, c, v, n, e, i, v, a, j, d, v, c, i, n, j, v, m, ...
## $ frequency <int> 746240010, 131299336, 58588289, 40335062, 33469467, 25098...
## $ range <dbl> 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 0.98, 1.0...
## $ range10 <dbl> 1.00, 0.99, 1.00, 1.00, 0.98, 0.95, 0.94, 0.97, 0.93, 0.9...
## $ caps <dbl> 0.11, 0.12, 0.01, 0.11, 0.06, 0.04, 0.36, 0.00, 0.01, 0.1...
The data contains 6,045 observations and 8 variables, already ranked by the frequency of each word. The range and range10 variables provide data on how frequently a word occurs across the millions of websites scraped; they take values between 0 and 1 and will be used as our y variable, as our closest representation of readability. The x variable can be either the word or the PoS variable; since the latter is categorical and fairly finite, we will use it as the explanatory variable.
# Number of missing values in each column.
text_Df %>%
  is.na() %>%
  colSums()
## rank wordID word PoS frequency range range10 caps
## 0 0 0 0 0 0 0 0
We have no missing values; the data is clean at this stage.
# The 20 most frequent words: sort by frequency first, then keep only the
# identifying columns.
text_Df %>%
  arrange(desc(frequency)) %>%
  select(wordID, word, frequency) %>%
  head(n = 20)
# Total corpus frequency per part of speech, largest first.
# frequency is read as a 32-bit integer and single words already reach about
# 7.5e8, so a group total can exceed .Machine$integer.max; sum() on integers
# would then return NA with a warning. Summing as double avoids that.
text_Df %>%
  group_by(PoS) %>%
  summarise(n = sum(as.numeric(frequency))) %>%
  arrange(desc(n))
## `summarise()` ungrouping output (override with `.groups` argument)
# Mean range for each part of speech, ordered from the highest mean to the
# lowest. df2 has one row per PoS with columns PoS and avrRange.
df2 <- text_Df %>%
  group_by(PoS) %>%
  summarise(avrRange = mean(range)) %>%
  arrange(desc(avrRange))
## `summarise()` ungrouping output (override with `.groups` argument)
print(df2)
# Dot plot of the mean range of each part of speech.
# reorder(PoS, -avrRange) sorts the x axis from high to low; dropping the
# minus sign would sort it from low to high instead.
ggplot(
  data = df2,
  mapping = aes(x = reorder(PoS, -avrRange), y = avrRange, color = PoS)
) +
  geom_point() +
  labs(
    title = "Average Distribution of Range by PoS",
    x = "Parts of Speech",
    y = "Average Range"
  )
# Next goal: see whether the graph can be made more legible.
The plot above shows the average range of each part of speech in the sampled data, sorted from highest to lowest. From this graph we can already identify the PoS categories that will prove influential in our model.
Next, we convert the PoS variable into dummy (one-hot) variables, so that the linear regression will receive numerical values for each category of our categorical PoS variable.
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
# One-hot encode PoS: one 0/1 column per part of speech, one row per row of
# df2. Base model.matrix() is used instead of dummies::dummy() because dummy()
# builds the column prefix from the call stack, which under knitr names every
# column after the .Rmd file path instead of "PoS".
df3 <- model.matrix(~ PoS - 1, data = df2)
# model.matrix() names the columns "PoS<level>"; insert the "." separator so
# they read "PoS.a", "PoS.c", ...
colnames(df3) <- sub("^PoS", "PoS.", colnames(df3))
rownames(df3) <- NULL
# Append "Range", our dependent variable (the mean range per PoS from df2).
df3 <- cbind(df3, Range = df2$avrRange)
df3
## C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.a
## [1,] 0
## [2,] 0
## [3,] 0
## [4,] 1
## [5,] 0
## [6,] 0
## [7,] 0
## [8,] 0
## [9,] 0
## [10,] 0
## [11,] 0
## [12,] 0
## C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.c
## [1,] 0
## [2,] 0
## [3,] 1
## [4,] 0
## [5,] 0
## [6,] 0
## [7,] 0
## [8,] 0
## [9,] 0
## [10,] 0
## [11,] 0
## [12,] 0
## C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.d
## [1,] 0
## [2,] 1
## [3,] 0
## [4,] 0
## [5,] 0
## [6,] 0
## [7,] 0
## [8,] 0
## [9,] 0
## [10,] 0
## [11,] 0
## [12,] 0
## C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.e
## [1,] 1
## [2,] 0
## [3,] 0
## [4,] 0
## [5,] 0
## [6,] 0
## [7,] 0
## [8,] 0
## [9,] 0
## [10,] 0
## [11,] 0
## [12,] 0
## C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.i
## [1,] 0
## [2,] 0
## [3,] 0
## [4,] 0
## [5,] 1
## [6,] 0
## [7,] 0
## [8,] 0
## [9,] 0
## [10,] 0
## [11,] 0
## [12,] 0
## C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.j
## [1,] 0
## [2,] 0
## [3,] 0
## [4,] 0
## [5,] 0
## [6,] 0
## [7,] 0
## [8,] 0
## [9,] 0
## [10,] 0
## [11,] 0
## [12,] 1
## C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.m
## [1,] 0
## [2,] 0
## [3,] 0
## [4,] 0
## [5,] 0
## [6,] 0
## [7,] 1
## [8,] 0
## [9,] 0
## [10,] 0
## [11,] 0
## [12,] 0
## C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.n
## [1,] 0
## [2,] 0
## [3,] 0
## [4,] 0
## [5,] 0
## [6,] 0
## [7,] 0
## [8,] 0
## [9,] 0
## [10,] 0
## [11,] 1
## [12,] 0
## C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.p
## [1,] 0
## [2,] 0
## [3,] 0
## [4,] 0
## [5,] 0
## [6,] 1
## [7,] 0
## [8,] 0
## [9,] 0
## [10,] 0
## [11,] 0
## [12,] 0
## C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.r
## [1,] 0
## [2,] 0
## [3,] 0
## [4,] 0
## [5,] 0
## [6,] 0
## [7,] 0
## [8,] 0
## [9,] 1
## [10,] 0
## [11,] 0
## [12,] 0
## C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.u
## [1,] 0
## [2,] 0
## [3,] 0
## [4,] 0
## [5,] 0
## [6,] 0
## [7,] 0
## [8,] 0
## [9,] 0
## [10,] 1
## [11,] 0
## [12,] 0
## C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.v
## [1,] 0
## [2,] 0
## [3,] 0
## [4,] 0
## [5,] 0
## [6,] 0
## [7,] 0
## [8,] 1
## [9,] 0
## [10,] 0
## [11,] 0
## [12,] 0
## Range
## [1,] 1.00000000
## [2,] 0.83400000
## [3,] 0.71375000
## [4,] 0.59500000
## [5,] 0.46117647
## [6,] 0.42333333
## [7,] 0.15611111
## [8,] 0.09534570
## [9,] 0.06836858
## [10,] 0.05090909
## [11,] 0.03592172
## [12,] 0.02621349
# Convert to a data frame: df3 is a matrix produced by cbind(), and it is
# passed to lm() below as its `data` argument.
df3 <- as.data.frame(df3)
## Build the linear regression model: Range regressed on every other column
## of df3 (the PoS indicator columns).
## NOTE: the indicators form a complete one-hot set (each row has exactly one
## 1), so together with the intercept the design is rank-deficient; lm()
## reports NA for the column it cannot estimate separately.
linearMod2 <- lm(Range ~ ., data=df3)
# Print the call and the fitted coefficients.
print(linearMod2)
##
## Call:
## lm(formula = Range ~ ., data = df3)
##
## Coefficients:
## (Intercept)
## 0.09535
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.a`
## 0.49965
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.c`
## 0.61840
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.d`
## 0.73865
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.e`
## 0.90465
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.i`
## 0.36583
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.j`
## -0.06913
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.m`
## 0.06077
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.n`
## -0.05942
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.p`
## 0.32799
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.r`
## -0.02698
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.u`
## -0.04444
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.v`
## NA
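From this result, we get a basic model for Range, which we assume to be a Readability Index. The NA result for PoS.v does not mean that its coefficient is close to 0: because the model has an intercept as well as a dummy column for every PoS category, the columns are perfectly collinear and lm() cannot estimate the last one. PoS.v therefore acts as the reference category; the intercept (0.09535) is its average range, and every other coefficient is the difference between that category's average range and PoS.v's.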
The variable attached to each of these coefficients can be the proportion of that PoS in a document: for example, how often PoS.a words such as “the” appear in a document, divided by the total number of words in that document.