#Text Analysis of iWeb text data ## Contains every 10th word (ranks 1, 11, 21, ...) from a frequency list of roughly 60,000 words.

library(dplyr)
library(stringr)
library(tidytext)
library(tidyverse)
library(lubridate)
library(scales)
library(corrplot)
library(Hmisc)
library(reshape2)

Initiating Data Checks

# Load the tab-separated iWeb lemma frequency sample.
# read.delim() has the same defaults as read.csv(..., sep = "\t").
# stringsAsFactors = TRUE is set explicitly so that `word` and `PoS` are read as
# factors on R >= 4.0 too (where the default became FALSE), matching the <fct>
# columns shown in the glimpse() output.
text_Df <- read.delim(
  "C:/Users/HP/Desktop/iweb_wordFreq_sample/iWeb_lemma.txt",
  header = TRUE,
  stringsAsFactors = TRUE
)
glimpse(text_Df)
## Observations: 6,045
## Variables: 8
## $ rank      <int> 1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 101, 111, 121, 131...
## $ wordID    <int> 1, 10, 20, 30, 37, 50, 59, 71, 75, 90, 100, 111, 123, 130...
## $ word      <fct> the, you, will, as, use, year, there, into, may, no, good...
## $ PoS       <fct> a, p, v, c, v, n, e, i, v, a, j, d, v, c, i, n, j, v, m, ...
## $ frequency <int> 746240010, 131299336, 58588289, 40335062, 33469467, 25098...
## $ range     <dbl> 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 0.98, 1.0...
## $ range10   <dbl> 1.00, 0.99, 1.00, 1.00, 0.98, 0.95, 0.94, 0.97, 0.93, 0.9...
## $ caps      <dbl> 0.11, 0.12, 0.01, 0.11, 0.06, 0.04, 0.36, 0.00, 0.01, 0.1...

The data contains 6,045 observations and 8 variables, already ranked according to the frequency of each word. The range and range10 variables provide data on the frequency of a word across the millions of websites scraped; these values range from 0 to 1, and range will be used as our y variable, as our closest representation of readability. The x variable can be either the word or the PoS variable; since the latter is categorical and fairly finite, we will use it as the explanatory variable.

# Count missing values per column: build the logical NA mask, then total each column.
missing_mask <- is.na(text_Df)
colSums(missing_mask)
##      rank    wordID      word       PoS frequency     range   range10      caps 
##         0         0         0         0         0         0         0         0

We have no missing values, so the data is clean at this stage.

Basic Data Summary

# Show the 20 most frequent words: sort by frequency (highest first), then keep
# only the identifying columns.
text_Df %>%
  arrange(desc(frequency)) %>%
  select(wordID, word, frequency) %>%
  head(n = 20)

Parts of Speech in Descending order

# Total corpus frequency per part of speech, largest first.
# `frequency` is an integer column with values up to ~7.5e8, so a group total can
# exceed the integer maximum (2147483647), where sum() returns NA with an
# "integer overflow" warning. Summing as double avoids that.
text_Df %>%
  group_by(PoS) %>%
  summarise(n = sum(as.numeric(frequency))) %>%
  arrange(desc(n))
## `summarise()` ungrouping output (override with `.groups` argument)
# Mean `range` for every part of speech, ordered from the highest mean to the lowest.
df2 <- text_Df %>%
  group_by(PoS) %>%
  summarise(avrRange = mean(range)) %>%
  arrange(-avrRange)
## `summarise()` ungrouping output (override with `.groups` argument)
print(df2)

Data Visualisation

# Scatter plot of the mean range per part of speech; the x axis is ordered from
# the highest mean to the lowest via reorder(PoS, -avrRange).
ggplot(data = df2) +
  geom_point(aes(x = reorder(PoS, -avrRange), y = avrRange, colour = PoS)) +
  labs(
    title = "Average Distribution of Range by PoS",
    x = "Parts of Speech",
    y = "Average Range"
  )

# reorder() sorts the PoS categories by avrRange; the minus sign in '-avrRange' gives high-to-low order, and removing it plots them in low-to-high order

# the next goal is to see if I can make the graph more legible.

The above is the sorted average distribution of Parts of Speech in the sampled data; from this graph we can already determine the PoS that will prove influential in our model.

Dummifying the Independent Factor variable

This is so that the linear regression will receive numerical values for each level of our categorical PoS variable.

# One-hot encode PoS with base R instead of dummies::dummy().
# dummy(df2$PoS, ...) named its columns after the .Rmd file path when knitted
# (visible in the printed matrix), which made the model output unreadable.
# model.matrix(~ f - 1) yields one 0/1 column per factor level; the columns are
# then renamed to PoS.<level> so they match the notation used in the write-up.
pos_factor <- factor(df2$PoS) # factor() also drops any unused levels
df3 <- model.matrix(~ pos_factor - 1)
colnames(df3) <- paste0("PoS.", levels(pos_factor))
df3 <- cbind(df3, Range = df2$avrRange) #Appending the "Range" our Dependent variable from previous data
df3
##       C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.a
##  [1,]                                                                                   0
##  [2,]                                                                                   0
##  [3,]                                                                                   0
##  [4,]                                                                                   1
##  [5,]                                                                                   0
##  [6,]                                                                                   0
##  [7,]                                                                                   0
##  [8,]                                                                                   0
##  [9,]                                                                                   0
## [10,]                                                                                   0
## [11,]                                                                                   0
## [12,]                                                                                   0
##       C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.c
##  [1,]                                                                                   0
##  [2,]                                                                                   0
##  [3,]                                                                                   1
##  [4,]                                                                                   0
##  [5,]                                                                                   0
##  [6,]                                                                                   0
##  [7,]                                                                                   0
##  [8,]                                                                                   0
##  [9,]                                                                                   0
## [10,]                                                                                   0
## [11,]                                                                                   0
## [12,]                                                                                   0
##       C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.d
##  [1,]                                                                                   0
##  [2,]                                                                                   1
##  [3,]                                                                                   0
##  [4,]                                                                                   0
##  [5,]                                                                                   0
##  [6,]                                                                                   0
##  [7,]                                                                                   0
##  [8,]                                                                                   0
##  [9,]                                                                                   0
## [10,]                                                                                   0
## [11,]                                                                                   0
## [12,]                                                                                   0
##       C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.e
##  [1,]                                                                                   1
##  [2,]                                                                                   0
##  [3,]                                                                                   0
##  [4,]                                                                                   0
##  [5,]                                                                                   0
##  [6,]                                                                                   0
##  [7,]                                                                                   0
##  [8,]                                                                                   0
##  [9,]                                                                                   0
## [10,]                                                                                   0
## [11,]                                                                                   0
## [12,]                                                                                   0
##       C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.i
##  [1,]                                                                                   0
##  [2,]                                                                                   0
##  [3,]                                                                                   0
##  [4,]                                                                                   0
##  [5,]                                                                                   1
##  [6,]                                                                                   0
##  [7,]                                                                                   0
##  [8,]                                                                                   0
##  [9,]                                                                                   0
## [10,]                                                                                   0
## [11,]                                                                                   0
## [12,]                                                                                   0
##       C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.j
##  [1,]                                                                                   0
##  [2,]                                                                                   0
##  [3,]                                                                                   0
##  [4,]                                                                                   0
##  [5,]                                                                                   0
##  [6,]                                                                                   0
##  [7,]                                                                                   0
##  [8,]                                                                                   0
##  [9,]                                                                                   0
## [10,]                                                                                   0
## [11,]                                                                                   0
## [12,]                                                                                   1
##       C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.m
##  [1,]                                                                                   0
##  [2,]                                                                                   0
##  [3,]                                                                                   0
##  [4,]                                                                                   0
##  [5,]                                                                                   0
##  [6,]                                                                                   0
##  [7,]                                                                                   1
##  [8,]                                                                                   0
##  [9,]                                                                                   0
## [10,]                                                                                   0
## [11,]                                                                                   0
## [12,]                                                                                   0
##       C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.n
##  [1,]                                                                                   0
##  [2,]                                                                                   0
##  [3,]                                                                                   0
##  [4,]                                                                                   0
##  [5,]                                                                                   0
##  [6,]                                                                                   0
##  [7,]                                                                                   0
##  [8,]                                                                                   0
##  [9,]                                                                                   0
## [10,]                                                                                   0
## [11,]                                                                                   1
## [12,]                                                                                   0
##       C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.p
##  [1,]                                                                                   0
##  [2,]                                                                                   0
##  [3,]                                                                                   0
##  [4,]                                                                                   0
##  [5,]                                                                                   0
##  [6,]                                                                                   1
##  [7,]                                                                                   0
##  [8,]                                                                                   0
##  [9,]                                                                                   0
## [10,]                                                                                   0
## [11,]                                                                                   0
## [12,]                                                                                   0
##       C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.r
##  [1,]                                                                                   0
##  [2,]                                                                                   0
##  [3,]                                                                                   0
##  [4,]                                                                                   0
##  [5,]                                                                                   0
##  [6,]                                                                                   0
##  [7,]                                                                                   0
##  [8,]                                                                                   0
##  [9,]                                                                                   1
## [10,]                                                                                   0
## [11,]                                                                                   0
## [12,]                                                                                   0
##       C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.u
##  [1,]                                                                                   0
##  [2,]                                                                                   0
##  [3,]                                                                                   0
##  [4,]                                                                                   0
##  [5,]                                                                                   0
##  [6,]                                                                                   0
##  [7,]                                                                                   0
##  [8,]                                                                                   0
##  [9,]                                                                                   0
## [10,]                                                                                   1
## [11,]                                                                                   0
## [12,]                                                                                   0
##       C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.v
##  [1,]                                                                                   0
##  [2,]                                                                                   0
##  [3,]                                                                                   0
##  [4,]                                                                                   0
##  [5,]                                                                                   0
##  [6,]                                                                                   0
##  [7,]                                                                                   0
##  [8,]                                                                                   1
##  [9,]                                                                                   0
## [10,]                                                                                   0
## [11,]                                                                                   0
## [12,]                                                                                   0
##            Range
##  [1,] 1.00000000
##  [2,] 0.83400000
##  [3,] 0.71375000
##  [4,] 0.59500000
##  [5,] 0.46117647
##  [6,] 0.42333333
##  [7,] 0.15611111
##  [8,] 0.09534570
##  [9,] 0.06836858
## [10,] 0.05090909
## [11,] 0.03592172
## [12,] 0.02621349
#Convert to DataFrame
df3 <- as.data.frame(df3)

## build linear regression model.
linearMod2 <- lm(Range ~ ., data=df3)  
print(linearMod2)
## 
## Call:
## lm(formula = Range ~ ., data = df3)
## 
## Coefficients:
##                                                                           (Intercept)  
##                                                                               0.09535  
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.a`  
##                                                                               0.49965  
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.c`  
##                                                                               0.61840  
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.d`  
##                                                                               0.73865  
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.e`  
##                                                                               0.90465  
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.i`  
##                                                                               0.36583  
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.j`  
##                                                                              -0.06913  
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.m`  
##                                                                               0.06077  
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.n`  
##                                                                              -0.05942  
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.p`  
##                                                                               0.32799  
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.r`  
##                                                                              -0.02698  
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.u`  
##                                                                              -0.04444  
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.v`  
##                                                                                    NA

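From this result, we get a basic model for Range, which we assume to be a Readability Index. The NA result for PoS.v does not mean that its value is close to 0: the twelve dummy columns always add up to 1, which duplicates the intercept, so lm() cannot estimate the last one and drops it. PoS.v therefore acts as the baseline category: the intercept (0.09535) is its average Range, and every other coefficient is the difference from that baseline.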

In simple terms:

Y = 0.09535 + 0.49965(PoS.a) + 0.61840(PoS.c) + 0.73865(PoS.d) + 0.90465(PoS.e) + 0.36583(PoS.i) - 0.06913(PoS.j) + 0.06077(PoS.m) - 0.05942(PoS.n) + 0.32799(PoS.p) - 0.02698(PoS.r) - 0.04444(PoS.u) + 0(PoS.v)

Each PoS variable could be taken as that category's proportion in a document: for example, how often PoS.a words (such as “the”) appear in a document, divided by the total number of words in that document.

Note: The PoS is determined by the first letter of the features of text, as described at http://ucrel.lancs.ac.uk/claws7tags.html

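Here is where I am stuck. As the summary below shows, the model was fitted to only 12 averaged observations (one per PoS) using one dummy per PoS, so it has as many parameters as data points: all residuals are 0, there are no residual degrees of freedom, and every standard error, t value and p-value is NA. The dummies are also perfectly collinear with the intercept. So for now the analysis stops above.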

## 
## Call:
## lm(formula = Range ~ ., data = df3)
## 
## Residuals:
## ALL 12 residuals are 0: no residual degrees of freedom!
## 
## Coefficients: (1 not defined because of singularities)
##                                                                                       Estimate
## (Intercept)                                                                            0.09535
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.a`  0.49965
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.c`  0.61840
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.d`  0.73865
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.e`  0.90465
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.i`  0.36583
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.j` -0.06913
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.m`  0.06077
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.n` -0.05942
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.p`  0.32799
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.r` -0.02698
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.u` -0.04444
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.v`       NA
##                                                                                       Std. Error
## (Intercept)                                                                                   NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.a`         NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.c`         NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.d`         NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.e`         NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.i`         NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.j`         NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.m`         NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.n`         NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.p`         NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.r`         NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.u`         NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.v`         NA
##                                                                                       t value
## (Intercept)                                                                                NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.a`      NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.c`      NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.d`      NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.e`      NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.i`      NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.j`      NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.m`      NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.n`      NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.p`      NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.r`      NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.u`      NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.v`      NA
##                                                                                       Pr(>|t|)
## (Intercept)                                                                                 NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.a`       NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.c`       NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.d`       NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.e`       NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.i`       NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.j`       NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.m`       NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.n`       NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.p`       NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.r`       NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.u`       NA
## `C:/Users/HP/Desktop/Sebacic/TEXT ANALYSIS/iWeb DataAnalysis/iWeb_TextAnalysis.Rmd.v`       NA
## 
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:    NaN 
## F-statistic:   NaN on 11 and 0 DF,  p-value: NA
library(lattice)

# Scatter of the mean range per PoS with a least-squares line overlaid.
xyplot(avrRange ~ PoS, data = df2, panel = function(x, y, ...) {
  panel.xyplot(x, y, ...)
  # Forward the panel's graphical arguments. The previous call passed a bare
  # `col`, which is not a colour value defined anywhere in this script.
  panel.lmline(x, y, ...)
})

This is an aside but we get to see how this regression result mimics the first distribution graph from above.