Demonstration code for the analysis of PDF documents.

Demonstration code for extracting metrics from PDF files.

This is for a single document; similar code will be used to process an entire directory.

For more details on how this notebook works, please see

Practical_Text

suppressPackageStartupMessages(library("ggplot2")) # For viz
suppressPackageStartupMessages(library(pdftools))  # for reading PDF files
# https://cran.r-project.org/web/packages/pdftools/index.html
suppressPackageStartupMessages(library(lattice))   # for some graphics
suppressPackageStartupMessages(library(quanteda))  # for text_readability 
suppressPackageStartupMessages(library(syuzhet))   # For Sentiment 
suppressPackageStartupMessages(library(tictoc))    # For Timing
suppressPackageStartupMessages(library(udpipe))    # For nlp
#https://cran.r-project.org/web/packages/udpipe/index.html
 # Min-max rescale `m` onto [0, 1], ignoring NA values when locating the
 # extremes (NA entries themselves stay NA in the result).
 # Note: if every non-missing value is identical the range is zero and the
 # result is NaN (0 / 0).
 normalit <- function(m) {
   lowest <- min(m, na.rm = TRUE)
   highest <- max(m, na.rm = TRUE)
   (m - lowest) / (highest - lowest)
 }
# Time how long it takes to pull the text out of the PDF.
tic()
filename <- "../PDF_Input/Calculus.pdf"
# pdf_text() yields a character vector (per the pdftools docs, one string
# per page of the document).
text <- pdf_text(pdf = filename)
toc()
## 2.36 sec elapsed
# Inspect the second element of the extracted text as a quick sanity check.
print(text[2])
## [1] "                         Contents\r\nCHAPTER   1       Introduction to Calculus\r\n        1.1 Velocity and Distance\r\n        1.2 Calculus Without Limits\r\n        1.3 The Velocity at an Instant\r\n        1.4 Circular Motion\r\n        1.5 A Review of Trigonometry\r\n        1.6 A Thousand Points of Light\r\n        1.7 Computing in Calculus\r\nCHAPTER   2       Derivatives\r\n            The Derivative of a Function\r\n            Powers and Polynomials\r\n            The Slope and the Tangent Line\r\n            Derivative of the Sine and Cosine\r\n            The Product and Quotient and Power Rules\r\n            Limits\r\n            Continuous Functions\r\nCHAPTER   3      Applications of the Derivative\r\n        3.1 Linear Approximation\r\n        3.2 Maximum and Minimum Problems\r\n        3.3 Second Derivatives: Minimum vs. Maximum\r\n        3.4 Graphs\r\n        3.5 Ellipses, Parabolas, and Hyperbolas\r\n        3.6                ,\r\n            Iterations x,+ = F(x,)\r\n        3.7 Newton's Method and Chaos\r\n        3.8 The Mean Value Theorem and l'H8pital's Rule\r\n"
tic()
# Cut the extracted text into five consecutive, equally sized sections;
# whatever does not divide evenly is appended to the last section.
section_size <- length(text) %/% 5
final_bit <- length(text) %% 5
doc_id_text <- rep(
  sprintf("Section %02d", 1:5),
  times = c(rep(section_size, 4), section_size + final_bit)
)

# One row per element of `text`, tagged with the section it belongs to.
raw.df <- data.frame(
  doc_id = doc_id_text,
  raw_text = text,
  stringsAsFactors = FALSE
)
toc()
## 0 sec elapsed
# List the columns of the per-entry data frame (section label and raw text).
names(raw.df)
## [1] "doc_id"   "raw_text"
# Peek at the raw text stored in rows 3 to 5.
raw.df$raw_text[3:5]
## [1] "                                Contents\r\nCHAPTER 4        The Chain Rule\r\n        4.1 Derivatives by the Chain Rule\r\n        4.2 Implicit Differentiation and Related Rates\r\n        4.3 Inverse Functions and Their Derivatives\r\n        4.4 Inverses of Trigonometric Functions\r\nCHAPTER   5      Integrals\r\n        5.1 The Idea of the Integral                            177\r\n        5.2 Antiderivatives                                     182\r\n        5.3 Summation vs. Integration                           187\r\n        5.4 Indefinite Integrals and Substitutions              195\r\n        5.5 The Definite Integral                               201\r\n        5.6 Properties of the Integral and the Average Value    206\r\n        5.7 The Fundamental Theorem and Its Consequences        213\r\n        5.8 Numerical Integration                               220\r\nCHAPTER   6      Exponentials and Logarithms\r\n        6.1 An Overview                                         228\r\n        6.2 The Exponential ex                                  236\r\n        6.3 Growth and Decay in Science and Economics           242\r\n        6.4 Logarithms                                          252\r\n        6.5 Separable Equations Including the Logistic Equation 259\r\n        6.6 Powers Instead of Exponentials                      267\r\n        6.7 Hyperbolic Functions                                277\r\nCHAPTER 7        Techniques of Integration\r\n        7.1 Integration by Parts\r\n        7.2 Trigonometric Integrals\r\n        7.3 Trigonometric Substitutions\r\n        7.4 Partial Fractions\r\n        7.5 Improper Integrals\r\nCHAPTER 8        Applications of the Integral\r\n        8.1 Areas and Volumes by Slices\r\n        8.2  Length of a Plane Curve\r\n        8.3 Area of a Surface of Revolution\r\n        8.4  Probability and Calculus\r\n        8.5  Masses and Moments\r\n        8.6  Force, Work, and Energy\r\n"
## [2] "                               Contents\r\nCHAPTER 9        Polar Coordinates and Complex Numbers\r\n       9.1  Polar Coordinates                          348\r\n       9.2  Polar Equations and Graphs                 351\r\n       9.3  Slope, Length, and Area for Polar Curves   356\r\n       9.4  Complex Numbers                            360\r\nCHAPTER 10       Infinite Series\r\n      10.1  The Geometric Series\r\n       10.2 Convergence Tests: Positive Series\r\n       10.3 Convergence Tests: All Series\r\n       10.4 The Taylor Series for ex, sin x, and cos x\r\n       10.5 Power Series\r\nCHAPTER 11       Vectors and Matrices\r\n      11.1  Vectors and Dot Products\r\n      11.2  Planes and Projections\r\n      11.3  Cross Products and Determinants\r\n      11.4  Matrices and Linear Equations\r\n      11.5  Linear Algebra in Three Dimensions\r\nCHAPTER 12       Motion along a Curve\r\n      12.1  The Position Vector                        446\r\n      12.2  Plane Motion: Projectiles and Cycloids     453\r\n      12.3  Tangent Vector and Normal Vector           459\r\n      12.4  Polar Coordinates and Planetary Motion     464\r\nCHAPTER 13       Partial Derivatives\r\n       13.1 Surfaces and Level Curves                  472\r\n       13.2 Partial Derivatives                        475\r\n       13.3 Tangent Planes and Linear Approximations   480\r\n       13.4 Directional Derivatives and Gradients      490\r\n       13.5 The Chain Rule                             497\r\n       13.6 Maxima, Minima, and Saddle Points          504\r\n       13.7 Constraints and Lagrange Multipliers       514\r\n"                                                                                                                                                                                                                                                                                                                                           
## [3] "                               Contents\r\nCHAPTER 14       Multiple Integrals\r\n       14.1 Double Integrals\r\n       14.2 Changing to Better Coordinates\r\n       14.3 Triple Integrals\r\n       14.4 Cylindrical and Spherical Coordinates\r\nCHAPTER 15       Vector Calculus\r\n       15.1 Vector Fields\r\n       15.2 Line Integrals\r\n       15.3 Green's Theorem\r\n       15.4 Surface Integrals\r\n       15.5 The Divergence Theorem\r\n       15.6 Stokes' Theorem and the Curl of F\r\nCHAPTER 16       Mathematics after Calculus\r\n       16.1 Linear Algebra\r\n       16.2 Differential Equations\r\n       16.3 Discrete Mathematics\r\n            Study Guide For Chapter 1\r\n            Answers to Odd-Numbered Problems\r\n            Index\r\n            Table of Integrals\r\n"
tic()
# Compute the readability scores for every row of raw.df, using doc_id as the
# document identifier and raw_text as the text to score.
tr.df <- textstat_readability(
  corpus(raw.df, docid_field = "doc_id", text_field = "raw_text")
)

# Break each document name apart on literal dots.
# NOTE(review): presumably quanteda appends ".<n>" to repeated doc_ids to keep
# document names unique, giving names like "Section 01.1" - confirm against
# the installed quanteda version.
spl <- strsplit(as.character(tr.df$document), ".", fixed = TRUE)

# section: everything before the final dot-separated piece. `collapse` is a
# plain separator string, not a regex, so the pieces are rejoined with "."
# (the previous "\\." inserted a literal backslash into the name).
tr.df$section <- vapply(
  spl,
  function(parts) paste(head(parts, -1), collapse = "."),
  character(1)
)

# subsection: only the trailing piece after the last dot. Names without any
# dot still yield "" for both section and subsection, as before.
tr.df$subsection <- vapply(
  spl,
  function(parts) paste(tail(parts[-1], 1), collapse = ""),
  character(1)
)
toc()
## 5.47 sec elapsed
# List the readability measures returned, plus the section/subsection columns
# added above.
names(tr.df)
##  [1] "document"              "ARI"                  
##  [3] "ARI.simple"            "Bormuth"              
##  [5] "Bormuth.GP"            "Coleman"              
##  [7] "Coleman.C2"            "Coleman.Liau"         
##  [9] "Coleman.Liau.grade"    "Coleman.Liau.short"   
## [11] "Dale.Chall"            "Dale.Chall.old"       
## [13] "Dale.Chall.PSK"        "Danielson.Bryan"      
## [15] "Danielson.Bryan.2"     "Dickes.Steiwer"       
## [17] "DRP"                   "ELF"                  
## [19] "Farr.Jenkins.Paterson" "Flesch"               
## [21] "Flesch.PSK"            "Flesch.Kincaid"       
## [23] "FOG"                   "FOG.PSK"              
## [25] "FOG.NRI"               "FORCAST"              
## [27] "FORCAST.RGL"           "Fucks"                
## [29] "Linsear.Write"         "LIW"                  
## [31] "nWS"                   "nWS.2"                
## [33] "nWS.3"                 "nWS.4"                
## [35] "RIX"                   "Scrabble"             
## [37] "SMOG"                  "SMOG.C"               
## [39] "SMOG.simple"           "SMOG.de"              
## [41] "Spache"                "Spache.old"           
## [43] "Strain"                "Traenkle.Bailer"      
## [45] "Traenkle.Bailer.2"     "Wheeler.Smith"        
## [47] "meanSentenceLength"    "meanWordSyllables"    
## [49] "section"               "subsection"
# Score quanteda's bundled data_corpus_inaugural with every available
# readability measure.
inaugReadability <- textstat_readability(
  x = data_corpus_inaugural,
  measure = "all"
)
# Pairwise correlations between the measures; the first column is dropped
# before correlating (it holds the document name, per names(tr.df) above).
cor(inaugReadability[, -1])
##                               ARI   ARI.simple     Bormuth  Bormuth.GP
## ARI                    1.00000000  0.999985813 -0.99188017  0.90232231
## ARI.simple             0.99998581  1.000000000 -0.99254312  0.90363285
## Bormuth               -0.99188017 -0.992543125  1.00000000 -0.92665498
## Bormuth.GP             0.90232231  0.903632850 -0.92665498  1.00000000
## Coleman               -0.57842561 -0.574425719  0.47841665 -0.34186891
## Coleman.C2            -0.76781514 -0.764705749  0.68780777 -0.51669363
## Coleman.Liau          -0.80785899 -0.804736665  0.72721217 -0.55936385
## Coleman.Liau.grade     0.80785899  0.804736665 -0.72721217  0.55936385
## Coleman.Liau.short     0.80788731  0.804765212 -0.72724574  0.55939187
## Dale.Chall            -0.99099423 -0.991693434  0.99996713 -0.92746308
## Dale.Chall.old         0.99099423  0.991693434 -0.99996713  0.92746308
## Dale.Chall.PSK         0.99099423  0.991693434 -0.99996713  0.92746308
## Danielson.Bryan        0.99708146  0.996677212 -0.97962805  0.88486422
## Danielson.Bryan.2     -0.31030344 -0.305238155  0.18706023 -0.04006627
## Dickes.Steiwer        -0.99844204 -0.998703929  0.99691688 -0.91329434
## DRP                    0.99188017  0.992543125 -1.00000000  0.92665498
## ELF                    0.99594813  0.995835853 -0.98541527  0.89837506
## Farr.Jenkins.Paterson -0.99154201 -0.992219215  0.99998536 -0.92696268
## Flesch                -0.96831214 -0.967086924  0.93162624 -0.82257543
## Flesch.PSK             0.98543988  0.984630595 -0.95852762  0.85630865
## Flesch.Kincaid         0.99828644  0.998096618 -0.98602857  0.89508980
## FOG                    0.99534334  0.995012384 -0.97973311  0.88272558
## FOG.PSK                0.99804952  0.998321809 -0.99679394  0.91366411
## FOG.NRI                0.49053097  0.489283365 -0.45717614  0.33617976
## FORCAST                0.57842561  0.574425719 -0.47841665  0.34186891
## FORCAST.RGL            0.57842561  0.574425719 -0.47841665  0.34186891
## Fucks                  0.99773065  0.998049346 -0.99751520  0.91989373
## Linsear.Write          0.98324412  0.982726746 -0.96316927  0.88236447
## LIW                    0.99534363  0.994936160 -0.97782198  0.88013733
## nWS                    0.97160116  0.970438242 -0.93628326  0.82182649
## nWS.2                  0.97550799  0.974435314 -0.94231463  0.82921349
## nWS.3                  0.98501356  0.984272471 -0.95970981  0.85402853
## nWS.4                  0.99487148  0.994513256 -0.97861519  0.88101179
## RIX                    0.99222014  0.991916139 -0.97704126  0.89115856
## Scrabble              -0.01121825 -0.009365094 -0.03334969 -0.01582000
## SMOG                   0.97393941  0.972959989 -0.94303306  0.82113760
## SMOG.C                 0.97457741  0.973614539 -0.94405248  0.82339465
## SMOG.simple            0.97393941  0.972959989 -0.94303306  0.82113760
## SMOG.de                0.97393941  0.972959989 -0.94303306  0.82113760
## Spache                 0.99099423  0.991693434 -0.99996713  0.92746308
## Spache.old             0.99099423  0.991693434 -0.99996713  0.92746308
## Strain                 0.99760642  0.997863573 -0.99596056  0.92013679
## Traenkle.Bailer       -0.99922385 -0.999411499  0.99593169 -0.91015126
## Traenkle.Bailer.2     -0.75491779 -0.751448659  0.66631519 -0.51471861
## Wheeler.Smith          0.99594813  0.995835853 -0.98541527  0.89837506
## meanSentenceLength     0.99099423  0.991693434 -0.99996713  0.92746308
## meanWordSyllables      0.67621719  0.672476162 -0.58187505  0.44795440
##                          Coleman Coleman.C2 Coleman.Liau
## ARI                   -0.5784256 -0.7678151   -0.8078590
## ARI.simple            -0.5744257 -0.7647057   -0.8047367
## Bormuth                0.4784167  0.6878078    0.7272122
## Bormuth.GP            -0.3418689 -0.5166936   -0.5593638
## Coleman                1.0000000  0.9578621    0.9027073
## Coleman.C2             0.9578621  1.0000000    0.9676044
## Coleman.Liau           0.9027073  0.9676044    1.0000000
## Coleman.Liau.grade    -0.9027073 -0.9676044   -1.0000000
## Coleman.Liau.short    -0.9026909 -0.9676038   -1.0000000
## Dale.Chall             0.4728710  0.6830083    0.7223808
## Dale.Chall.old        -0.4728710 -0.6830083   -0.7223808
## Dale.Chall.PSK        -0.4728710 -0.6830083   -0.7223808
## Danielson.Bryan       -0.6320177 -0.8068048   -0.8471402
## Danielson.Bryan.2      0.8922077  0.7931428    0.8066665
## Dickes.Steiwer         0.5390065  0.7368282    0.7764052
## DRP                   -0.4784167 -0.6878078   -0.7272122
## ELF                   -0.6127006 -0.7903882   -0.8134246
## Farr.Jenkins.Paterson  0.4768093  0.6862258    0.7252233
## Flesch                 0.7524555  0.8921701    0.9145879
## Flesch.PSK            -0.6991742 -0.8564052   -0.8829007
## Flesch.Kincaid        -0.6124186 -0.7933597   -0.8255002
## FOG                   -0.6279137 -0.8051202   -0.8390619
## FOG.PSK               -0.5401789 -0.7376615   -0.7749720
## FOG.NRI               -0.4679026 -0.5350724   -0.5426918
## FORCAST               -1.0000000 -0.9578621   -0.9027073
## FORCAST.RGL           -1.0000000 -0.9578621   -0.9027073
## Fucks                 -0.5305954 -0.7279653   -0.7681848
## Linsear.Write         -0.6540873 -0.8139821   -0.8446074
## LIW                   -0.6415138 -0.8144833   -0.8466378
## nWS                   -0.7435480 -0.8869623   -0.9108590
## nWS.2                 -0.7297189 -0.8775918   -0.9042061
## nWS.3                 -0.6826246 -0.8444418   -0.8757884
## nWS.4                 -0.6316756 -0.8078997   -0.8416766
## RIX                   -0.6292106 -0.7992488   -0.8296056
## Scrabble               0.3522408  0.2472068    0.1918831
## SMOG                  -0.7133315 -0.8681187   -0.8952482
## SMOG.C                -0.7113151 -0.8663610   -0.8936817
## SMOG.simple           -0.7133315 -0.8681187   -0.8952482
## SMOG.de               -0.7133315 -0.8681187   -0.8952482
## Spache                -0.4728710 -0.6830083   -0.7223808
## Spache.old            -0.4728710 -0.6830083   -0.7223808
## Strain                -0.5461293 -0.7394040   -0.7744192
## Traenkle.Bailer        0.5491468  0.7451720    0.7853284
## Traenkle.Bailer.2      0.9206771  0.9574552    0.9904793
## Wheeler.Smith         -0.6127006 -0.7903882   -0.8134246
## meanSentenceLength    -0.4728710 -0.6830083   -0.7223808
## meanWordSyllables     -0.9645525 -0.9629188   -0.9543023
##                       Coleman.Liau.grade Coleman.Liau.short  Dale.Chall
## ARI                            0.8078590          0.8078873 -0.99099423
## ARI.simple                     0.8047367          0.8047652 -0.99169343
## Bormuth                       -0.7272122         -0.7272457  0.99996713
## Bormuth.GP                     0.5593638          0.5593919 -0.92746308
## Coleman                       -0.9027073         -0.9026909  0.47287100
## Coleman.C2                    -0.9676044         -0.9676038  0.68300831
## Coleman.Liau                  -1.0000000         -1.0000000  0.72238083
## Coleman.Liau.grade             1.0000000          1.0000000 -0.72238083
## Coleman.Liau.short             1.0000000          1.0000000 -0.72241463
## Dale.Chall                    -0.7223808         -0.7224146  1.00000000
## Dale.Chall.old                 0.7223808          0.7224146 -1.00000000
## Dale.Chall.PSK                 0.7223808          0.7224146 -1.00000000
## Danielson.Bryan                0.8471402          0.8471650 -0.97829531
## Danielson.Bryan.2             -0.8066665         -0.8066351  0.18028533
## Dickes.Steiwer                -0.7764052         -0.7764357  0.99638999
## DRP                            0.7272122          0.7272457 -0.99996713
## ELF                            0.8134246          0.8134514 -0.98451149
## Farr.Jenkins.Paterson         -0.7252233         -0.7252569  0.99998999
## Flesch                        -0.9145879         -0.9146054  0.92913685
## Flesch.PSK                     0.8829007          0.8829221 -0.95657238
## Flesch.Kincaid                 0.8255002          0.8255270 -0.98488025
## FOG                            0.8390619          0.8390877 -0.97841468
## FOG.PSK                        0.7749720          0.7750026 -0.99626213
## FOG.NRI                        0.5426918          0.5426992 -0.45492516
## FORCAST                        0.9027073          0.9026909 -0.47287100
## FORCAST.RGL                    0.9027073          0.9026909 -0.47287100
## Fucks                          0.7681848          0.7682153 -0.99711281
## Linsear.Write                  0.8446074          0.8446295 -0.96173384
## LIW                            0.8466378          0.8466626 -0.97649153
## nWS                            0.9108590          0.9108773 -0.93396357
## nWS.2                          0.9042061          0.9042253 -0.94010518
## nWS.3                          0.8757884          0.8758106 -0.95786413
## nWS.4                          0.8416766          0.8417021 -0.97726151
## RIX                            0.8296056          0.8296301 -0.97599601
## Scrabble                      -0.1918831         -0.1918618 -0.03546462
## SMOG                           0.8952482          0.8952689 -0.94089430
## SMOG.C                         0.8936817          0.8937024 -0.94194286
## SMOG.simple                    0.8952482          0.8952689 -0.94089430
## SMOG.de                        0.8952482          0.8952689 -0.94089430
## Spache                         0.7223808          0.7224146 -1.00000000
## Spache.old                     0.7223808          0.7224146 -1.00000000
## Strain                         0.7744192          0.7744491 -0.99544237
## Traenkle.Bailer               -0.7853284         -0.7853584  0.99529866
## Traenkle.Bailer.2             -0.9904793         -0.9904734  0.66117918
## Wheeler.Smith                  0.8134246          0.8134514 -0.98451149
## meanSentenceLength             0.7223808          0.7224146 -1.00000000
## meanWordSyllables              0.9543023          0.9542911 -0.57632421
##                       Dale.Chall.old Dale.Chall.PSK Danielson.Bryan
## ARI                       0.99099423     0.99099423      0.99708146
## ARI.simple                0.99169343     0.99169343      0.99667721
## Bormuth                  -0.99996713    -0.99996713     -0.97962805
## Bormuth.GP                0.92746308     0.92746308      0.88486422
## Coleman                  -0.47287100    -0.47287100     -0.63201768
## Coleman.C2               -0.68300831    -0.68300831     -0.80680483
## Coleman.Liau             -0.72238083    -0.72238083     -0.84714024
## Coleman.Liau.grade        0.72238083     0.72238083      0.84714024
## Coleman.Liau.short        0.72241463     0.72241463      0.84716497
## Dale.Chall               -1.00000000    -1.00000000     -0.97829531
## Dale.Chall.old            1.00000000     1.00000000      0.97829531
## Dale.Chall.PSK            1.00000000     1.00000000      0.97829531
## Danielson.Bryan           0.97829531     0.97829531      1.00000000
## Danielson.Bryan.2        -0.18028533    -0.18028533     -0.37865977
## Dickes.Steiwer           -0.99638999    -0.99638999     -0.99178586
## DRP                       0.99996713     0.99996713      0.97962805
## ELF                       0.98451149     0.98451149      0.99535015
## Farr.Jenkins.Paterson    -0.99998999    -0.99998999     -0.97914589
## Flesch                   -0.92913685    -0.92913685     -0.98231683
## Flesch.PSK                0.95657238     0.95657238      0.99363576
## Flesch.Kincaid            0.98488025     0.98488025      0.99786963
## FOG                       0.97841468     0.97841468      0.99695239
## FOG.PSK                   0.99626213     0.99626213      0.99126431
## FOG.NRI                   0.45492516     0.45492516      0.50283748
## FORCAST                   0.47287100     0.47287100      0.63201768
## FORCAST.RGL               0.47287100     0.47287100      0.63201768
## Fucks                     0.99711281     0.99711281      0.99077571
## Linsear.Write             0.96173384     0.96173384      0.98940268
## LIW                       0.97649153     0.97649153      0.99817496
## nWS                       0.93396357     0.93396357      0.98486561
## nWS.2                     0.94010518     0.94010518      0.98751970
## nWS.3                     0.95786413     0.95786413      0.99235519
## nWS.4                     0.97726151     0.97726151      0.99686096
## RIX                       0.97599601     0.97599601      0.99500484
## Scrabble                  0.03546462     0.03546462     -0.03679625
## SMOG                      0.94089430     0.94089430      0.98460337
## SMOG.C                    0.94194286     0.94194286      0.98509384
## SMOG.simple               0.94089430     0.94089430      0.98460337
## SMOG.de                   0.94089430     0.94089430      0.98460337
## Spache                    1.00000000     1.00000000      0.97829531
## Spache.old                1.00000000     1.00000000      0.97829531
## Strain                    0.99544237     0.99544237      0.99164686
## Traenkle.Bailer          -0.99529866    -0.99529866     -0.99349190
## Traenkle.Bailer.2        -0.66117918    -0.66117918     -0.79995865
## Wheeler.Smith             0.98451149     0.98451149      0.99535015
## meanSentenceLength        1.00000000     1.00000000      0.97829531
## meanWordSyllables         0.57632421     0.57632421      0.72593204
##                       Danielson.Bryan.2 Dickes.Steiwer         DRP
## ARI                         -0.31030344    -0.99844204  0.99188017
## ARI.simple                  -0.30523815    -0.99870393  0.99254312
## Bormuth                      0.18706023     0.99691688 -1.00000000
## Bormuth.GP                  -0.04006627    -0.91329434  0.92665498
## Coleman                      0.89220768     0.53900648 -0.47841665
## Coleman.C2                   0.79314280     0.73682822 -0.68780777
## Coleman.Liau                 0.80666655     0.77640516 -0.72721217
## Coleman.Liau.grade          -0.80666655    -0.77640516  0.72721217
## Coleman.Liau.short          -0.80663507    -0.77643567  0.72724574
## Dale.Chall                   0.18028533     0.99638999 -0.99996713
## Dale.Chall.old              -0.18028533    -0.99638999  0.99996713
## Dale.Chall.PSK              -0.18028533    -0.99638999  0.99996713
## Danielson.Bryan             -0.37865977    -0.99178586  0.97962805
## Danielson.Bryan.2            1.00000000     0.26101392 -0.18706023
## Dickes.Steiwer               0.26101392     1.00000000 -0.99691688
## DRP                         -0.18706023    -0.99691688  1.00000000
## ELF                         -0.32598561    -0.99345440  0.98541527
## Farr.Jenkins.Paterson        0.18438177     0.99672456 -0.99998536
## Flesch                       0.51737927     0.95524130 -0.93162624
## Flesch.PSK                  -0.44828841    -0.97634394  0.95852762
## Flesch.Kincaid              -0.34148774    -0.99512741  0.98602857
## FOG                         -0.36653699    -0.99112313  0.97973311
## FOG.PSK                     -0.25909248    -0.99945426  0.99679394
## FOG.NRI                     -0.36711298    -0.46497779  0.45717614
## FORCAST                     -0.89220768    -0.53900648  0.47841665
## FORCAST.RGL                 -0.89220768    -0.53900648  0.47841665
## Fucks                       -0.24974568    -0.99940700  0.99751520
## Linsear.Write               -0.39525523    -0.97750750  0.96316927
## LIW                         -0.37953455    -0.99028645  0.97782198
## nWS                         -0.50773654    -0.95947137  0.93628326
## nWS.2                       -0.49287612    -0.96426773  0.94231463
## nWS.3                       -0.43696629    -0.97697873  0.95970981
## nWS.4                       -0.37128519    -0.99039744  0.97861519
## RIX                         -0.35912065    -0.98825075  0.97704126
## Scrabble                     0.32114325    -0.01434789  0.03334969
## SMOG                        -0.47640589    -0.96367176  0.94303306
## SMOG.C                      -0.47358728    -0.96447120  0.94405248
## SMOG.simple                 -0.47640589    -0.96367176  0.94303306
## SMOG.de                     -0.47640589    -0.96367176  0.94303306
## Spache                      -0.18028533    -0.99638999  0.99996713
## Spache.old                  -0.18028533    -0.99638999  0.99996713
## Strain                      -0.26089612    -0.99875609  0.99596056
## Traenkle.Bailer              0.27409970     0.99959449 -0.99593169
## Traenkle.Bailer.2            0.85092550     0.71952902 -0.66631519
## Wheeler.Smith               -0.32598561    -0.99345440  0.98541527
## meanSentenceLength          -0.18028533    -0.99638999  0.99996713
## meanWordSyllables           -0.87721338    -0.63935562  0.58187505
##                               ELF Farr.Jenkins.Paterson     Flesch
## ARI                    0.99594813           -0.99154201 -0.9683121
## ARI.simple             0.99583585           -0.99221922 -0.9670869
## Bormuth               -0.98541527            0.99998536  0.9316262
## Bormuth.GP             0.89837506           -0.92696268 -0.8225754
## Coleman               -0.61270055            0.47680930  0.7524555
## Coleman.C2            -0.79038825            0.68622584  0.8921701
## Coleman.Liau          -0.81342458            0.72522331  0.9145879
## Coleman.Liau.grade     0.81342458           -0.72522331 -0.9145879
## Coleman.Liau.short     0.81345135           -0.72525694 -0.9146054
## Dale.Chall            -0.98451149            0.99998999  0.9291368
## Dale.Chall.old         0.98451149           -0.99998999 -0.9291368
## Dale.Chall.PSK         0.98451149           -0.99998999 -0.9291368
## Danielson.Bryan        0.99535015           -0.97914589 -0.9823168
## Danielson.Bryan.2     -0.32598561            0.18438177  0.5173793
## Dickes.Steiwer        -0.99345440            0.99672456  0.9552413
## DRP                    0.98541527           -0.99998536 -0.9316262
## ELF                    1.00000000           -0.98524897 -0.9737553
## Farr.Jenkins.Paterson -0.98524897            1.00000000  0.9307176
## Flesch                -0.97375531            0.93071764  1.0000000
## Flesch.PSK             0.98827872           -0.95781641 -0.9965628
## Flesch.Kincaid         0.99727425           -0.98561541 -0.9791403
## FOG                    0.99443432           -0.97924413 -0.9834416
## FOG.PSK                0.99386237           -0.99660297 -0.9567470
## FOG.NRI                0.49527062           -0.45620440 -0.5313693
## FORCAST                0.61270055           -0.47680930 -0.7524555
## FORCAST.RGL            0.61270055           -0.47680930 -0.7524555
## Fucks                  0.99362322           -0.99740292 -0.9519647
## Linsear.Write          0.98844709           -0.96273644 -0.9842487
## LIW                    0.99591724           -0.97739468 -0.9839516
## nWS                    0.97678636           -0.93548749 -0.9980699
## nWS.2                  0.97966902           -0.94154405 -0.9974721
## nWS.3                  0.98635632           -0.95902099 -0.9933395
## nWS.4                  0.99411156           -0.97811285 -0.9843084
## RIX                    0.99677384           -0.97683788 -0.9779743
## Scrabble              -0.04113608           -0.03359018  0.1056726
## SMOG                   0.97747350           -0.94224804 -0.9945369
## SMOG.C                 0.97820253           -0.94328383 -0.9944270
## SMOG.simple            0.97747350           -0.94224804 -0.9945369
## SMOG.de                0.97747350           -0.94224804 -0.9945369
## Spache                 0.98451149           -0.99998999 -0.9291368
## Spache.old             0.98451149           -0.99998999 -0.9291368
## Strain                 0.99576469           -0.99581541 -0.9579642
## Traenkle.Bailer       -0.99437253            0.99568736  0.9590869
## Traenkle.Bailer.2     -0.76363493            0.66426051  0.8800874
## Wheeler.Smith          1.00000000           -0.98524897 -0.9737553
## meanSentenceLength     0.98451149           -0.99998999 -0.9291368
## meanWordSyllables      0.69782528           -0.57983300 -0.8376402
##                        Flesch.PSK Flesch.Kincaid         FOG    FOG.PSK
## ARI                    0.98543988     0.99828644  0.99534334  0.9980495
## ARI.simple             0.98463060     0.99809662  0.99501238  0.9983218
## Bormuth               -0.95852762    -0.98602857 -0.97973311 -0.9967939
## Bormuth.GP             0.85630865     0.89508980  0.88272558  0.9136641
## Coleman               -0.69917416    -0.61241856 -0.62791369 -0.5401789
## Coleman.C2            -0.85640522    -0.79335974 -0.80512019 -0.7376615
## Coleman.Liau          -0.88290067    -0.82550024 -0.83906193 -0.7749720
## Coleman.Liau.grade     0.88290067     0.82550024  0.83906193  0.7749720
## Coleman.Liau.short     0.88292206     0.82552702  0.83908765  0.7750026
## Dale.Chall            -0.95657238    -0.98488025 -0.97841468 -0.9962621
## Dale.Chall.old         0.95657238     0.98488025  0.97841468  0.9962621
## Dale.Chall.PSK         0.95657238     0.98488025  0.97841468  0.9962621
## Danielson.Bryan        0.99363576     0.99786963  0.99695239  0.9912643
## Danielson.Bryan.2     -0.44828841    -0.34148774 -0.36653699 -0.2590925
## Dickes.Steiwer        -0.97634394    -0.99512741 -0.99112313 -0.9994543
## DRP                    0.95852762     0.98602857  0.97973311  0.9967939
## ELF                    0.98827872     0.99727425  0.99443432  0.9938624
## Farr.Jenkins.Paterson -0.95781641    -0.98561541 -0.97924413 -0.9966030
## Flesch                -0.99656281    -0.97914032 -0.98344165 -0.9567470
## Flesch.PSK             1.00000000     0.99260681  0.99454929  0.9775024
## Flesch.Kincaid         0.99260681     1.00000000  0.99846226  0.9957626
## FOG                    0.99454929     0.99846226  1.00000000  0.9926083
## FOG.PSK                0.97750238     0.99576264  0.99260833  1.0000000
## FOG.NRI                0.52085197     0.49896862  0.50751706  0.4793132
## FORCAST                0.69917416     0.61241856  0.62791369  0.5401789
## FORCAST.RGL            0.69917416     0.61241856  0.62791369  0.5401789
## Fucks                  0.97392268     0.99398941  0.98936219  0.9991427
## Linsear.Write          0.99144816     0.98967358  0.99372655  0.9801897
## LIW                    0.99452041     0.99764432  0.99741442  0.9903981
## nWS                    0.99612259     0.98088847  0.98637622  0.9608083
## nWS.2                  0.99702731     0.98398342  0.98925526  0.9656186
## nWS.3                  0.99774824     0.99180645  0.99654303  0.9790944
## nWS.4                  0.99497427     0.99823466  0.99998483  0.9919248
## RIX                    0.98969704     0.99457144  0.99391070  0.9886425
## Scrabble              -0.07536487    -0.03002267 -0.02275926  0.0113141
## SMOG                   0.99489006     0.98304183  0.99004142  0.9664107
## SMOG.C                 0.99503836     0.98356657  0.99047862  0.9672092
## SMOG.simple            0.99489006     0.98304183  0.99004142  0.9664107
## SMOG.de                0.99489006     0.98304183  0.99004142  0.9664107
## Spache                 0.95657238     0.98488025  0.97841468  0.9962621
## Spache.old             0.95657238     0.98488025  0.97841468  0.9962621
## Strain                 0.97827829     0.99588242  0.99196627  0.9992502
## Traenkle.Bailer       -0.97913122    -0.99632949 -0.99246787 -0.9993755
## Traenkle.Bailer.2     -0.84198853    -0.77570251 -0.79053472 -0.7187449
## Wheeler.Smith          0.98827872     0.99727425  0.99443432  0.9938624
## meanSentenceLength     0.95657238     0.98488025  0.97841468  0.9962621
## meanWordSyllables      0.78951164     0.70918290  0.72824185  0.6428726
##                          FOG.NRI    FORCAST FORCAST.RGL        Fucks
## ARI                    0.4905310  0.5784256   0.5784256  0.997730647
## ARI.simple             0.4892834  0.5744257   0.5744257  0.998049346
## Bormuth               -0.4571761 -0.4784167  -0.4784167 -0.997515204
## Bormuth.GP             0.3361798  0.3418689   0.3418689  0.919893732
## Coleman               -0.4679026 -1.0000000  -1.0000000 -0.530595415
## Coleman.C2            -0.5350724 -0.9578621  -0.9578621 -0.727965258
## Coleman.Liau          -0.5426918 -0.9027073  -0.9027073 -0.768184772
## Coleman.Liau.grade     0.5426918  0.9027073   0.9027073  0.768184772
## Coleman.Liau.short     0.5426992  0.9026909   0.9026909  0.768215251
## Dale.Chall            -0.4549252 -0.4728710  -0.4728710 -0.997112805
## Dale.Chall.old         0.4549252  0.4728710   0.4728710  0.997112805
## Dale.Chall.PSK         0.4549252  0.4728710   0.4728710  0.997112805
## Danielson.Bryan        0.5028375  0.6320177   0.6320177  0.990775706
## Danielson.Bryan.2     -0.3671130 -0.8922077  -0.8922077 -0.249745678
## Dickes.Steiwer        -0.4649778 -0.5390065  -0.5390065 -0.999407002
## DRP                    0.4571761  0.4784167   0.4784167  0.997515204
## ELF                    0.4952706  0.6127006   0.6127006  0.993623225
## Farr.Jenkins.Paterson -0.4562044 -0.4768093  -0.4768093 -0.997402917
## Flesch                -0.5313693 -0.7524555  -0.7524555 -0.951964715
## Flesch.PSK             0.5208520  0.6991742   0.6991742  0.973922677
## Flesch.Kincaid         0.4989686  0.6124186   0.6124186  0.993989409
## FOG                    0.5075171  0.6279137   0.6279137  0.989362193
## FOG.PSK                0.4793132  0.5401789   0.5401789  0.999142674
## FOG.NRI                1.0000000  0.4679026   0.4679026  0.472326429
## FORCAST                0.4679026  1.0000000   1.0000000  0.530595415
## FORCAST.RGL            0.4679026  1.0000000   1.0000000  0.530595415
## Fucks                  0.4723264  0.5305954   0.5305954  1.000000000
## Linsear.Write          0.5059810  0.6540873   0.6540873  0.977258794
## LIW                    0.5041603  0.6415138   0.6415138  0.988738238
## nWS                    0.5303825  0.7435480   0.7435480  0.956043120
## nWS.2                  0.5280599  0.7297189   0.7297189  0.960995665
## nWS.3                  0.5225015  0.6826246   0.6826246  0.974240643
## nWS.4                  0.5086192  0.6316756   0.6316756  0.988571335
## RIX                    0.4923100  0.6292106   0.6292106  0.988410496
## Scrabble              -0.1428063 -0.3522408  -0.3522408  0.008523205
## SMOG                   0.5367565  0.7133315   0.7133315  0.960356032
## SMOG.C                 0.5360437  0.7113151   0.7113151  0.961281850
## SMOG.simple            0.5367565  0.7133315   0.7133315  0.960356032
## SMOG.de                0.5367565  0.7133315   0.7133315  0.960356032
## Spache                 0.4549252  0.4728710   0.4728710  0.997112805
## Spache.old             0.4549252  0.4728710   0.4728710  0.997112805
## Strain                 0.4766100  0.5461293   0.5461293  0.999278833
## Traenkle.Bailer       -0.4831579 -0.5491468  -0.5491468 -0.999275945
## Traenkle.Bailer.2     -0.5380406 -0.9206771  -0.9206771 -0.712342431
## Wheeler.Smith          0.4952706  0.6127006   0.6127006  0.993623225
## meanSentenceLength     0.4549252  0.4728710   0.4728710  0.997112805
## meanWordSyllables      0.5024015  0.9645525   0.9645525  0.631045601
##                       Linsear.Write         LIW        nWS       nWS.2
## ARI                      0.98324412  0.99534363  0.9716012  0.97550799
## ARI.simple               0.98272675  0.99493616  0.9704382  0.97443531
## Bormuth                 -0.96316927 -0.97782198 -0.9362833 -0.94231463
## Bormuth.GP               0.88236447  0.88013733  0.8218265  0.82921349
## Coleman                 -0.65408730 -0.64151385 -0.7435480 -0.72971888
## Coleman.C2              -0.81398206 -0.81448329 -0.8869623 -0.87759182
## Coleman.Liau            -0.84460735 -0.84663781 -0.9108590 -0.90420615
## Coleman.Liau.grade       0.84460735  0.84663781  0.9108590  0.90420615
## Coleman.Liau.short       0.84462952  0.84666264  0.9108773  0.90422529
## Dale.Chall              -0.96173384 -0.97649153 -0.9339636 -0.94010518
## Dale.Chall.old           0.96173384  0.97649153  0.9339636  0.94010518
## Dale.Chall.PSK           0.96173384  0.97649153  0.9339636  0.94010518
## Danielson.Bryan          0.98940268  0.99817496  0.9848656  0.98751970
## Danielson.Bryan.2       -0.39525523 -0.37953455 -0.5077365 -0.49287612
## Dickes.Steiwer          -0.97750750 -0.99028645 -0.9594714 -0.96426773
## DRP                      0.96316927  0.97782198  0.9362833  0.94231463
## ELF                      0.98844709  0.99591724  0.9767864  0.97966902
## Farr.Jenkins.Paterson   -0.96273644 -0.97739468 -0.9354875 -0.94154405
## Flesch                  -0.98424870 -0.98395156 -0.9980699 -0.99747205
## Flesch.PSK               0.99144816  0.99452041  0.9961226  0.99702731
## Flesch.Kincaid           0.98967358  0.99764432  0.9808885  0.98398342
## FOG                      0.99372655  0.99741442  0.9863762  0.98925526
## FOG.PSK                  0.98018969  0.99039812  0.9608083  0.96561861
## FOG.NRI                  0.50598103  0.50416033  0.5303825  0.52805992
## FORCAST                  0.65408730  0.64151385  0.7435480  0.72971888
## FORCAST.RGL              0.65408730  0.64151385  0.7435480  0.72971888
## Fucks                    0.97725879  0.98873824  0.9560431  0.96099567
## Linsear.Write            1.00000000  0.99064913  0.9868813  0.98906653
## LIW                      0.99064913  1.00000000  0.9871894  0.98960771
## nWS                      0.98688126  0.98718935  1.0000000  0.99978564
## nWS.2                    0.98906653  0.98960771  0.9997856  1.00000000
## nWS.3                    0.99452970  0.99372252  0.9954778  0.99701030
## nWS.4                    0.99399250  0.99738312  0.9871908  0.98998119
## RIX                      0.99270597  0.99722072  0.9816466  0.98430393
## Scrabble                -0.03242714 -0.03879638 -0.0868361 -0.07743888
## SMOG                     0.99042784  0.98677649  0.9968232  0.99752895
## SMOG.C                   0.99111709  0.98723721  0.9967751  0.99753895
## SMOG.simple              0.99042784  0.98677649  0.9968232  0.99752895
## SMOG.de                  0.99042784  0.98677649  0.9968232  0.99752895
## Spache                   0.96173384  0.97649153  0.9339636  0.94010518
## Spache.old               0.96173384  0.97649153  0.9339636  0.94010518
## Strain                   0.98250555  0.99073666  0.9611723  0.96577125
## Traenkle.Bailer         -0.97898368 -0.99190213 -0.9628587 -0.96743426
## Traenkle.Bailer.2       -0.80373175 -0.80039428 -0.8744090 -0.86651511
## Wheeler.Smith            0.98844709  0.99591724  0.9767864  0.97966902
## meanSentenceLength       0.96173384  0.97649153  0.9339636  0.94010518
## meanWordSyllables        0.75466877  0.73221003  0.8262434  0.81584881
##                             nWS.3       nWS.4         RIX      Scrabble
## ARI                    0.98501356  0.99487148  0.99222014 -0.0112182516
## ARI.simple             0.98427247  0.99451326  0.99191614 -0.0093650938
## Bormuth               -0.95970981 -0.97861519 -0.97704126 -0.0333496851
## Bormuth.GP             0.85402853  0.88101179  0.89115856 -0.0158199992
## Coleman               -0.68262459 -0.63167564 -0.62921060  0.3522407942
## Coleman.C2            -0.84444185 -0.80789970 -0.79924879  0.2472067525
## Coleman.Liau          -0.87578843 -0.84167664 -0.82960560  0.1918831342
## Coleman.Liau.grade     0.87578843  0.84167664  0.82960560 -0.1918831342
## Coleman.Liau.short     0.87581059  0.84170213  0.82963012 -0.1918617971
## Dale.Chall            -0.95786413 -0.97726151 -0.97599601 -0.0354646177
## Dale.Chall.old         0.95786413  0.97726151  0.97599601  0.0354646177
## Dale.Chall.PSK         0.95786413  0.97726151  0.97599601  0.0354646177
## Danielson.Bryan        0.99235519  0.99686096  0.99500484 -0.0367962483
## Danielson.Bryan.2     -0.43696629 -0.37128519 -0.35912065  0.3211432536
## Dickes.Steiwer        -0.97697873 -0.99039744 -0.98825075 -0.0143478880
## DRP                    0.95970981  0.97861519  0.97704126  0.0333496851
## ELF                    0.98635632  0.99411156  0.99677384 -0.0411360767
## Farr.Jenkins.Paterson -0.95902099 -0.97811285 -0.97683788 -0.0335901823
## Flesch                -0.99333954 -0.98430842 -0.97797435  0.1056725636
## Flesch.PSK             0.99774824  0.99497427  0.98969704 -0.0753648675
## Flesch.Kincaid         0.99180645  0.99823466  0.99457144 -0.0300226725
## FOG                    0.99654303  0.99998483  0.99391070 -0.0227592579
## FOG.PSK                0.97909440  0.99192476  0.98864254  0.0113141021
## FOG.NRI                0.52250155  0.50861922  0.49230997 -0.1428062743
## FORCAST                0.68262459  0.63167564  0.62921060 -0.3522407942
## FORCAST.RGL            0.68262459  0.63167564  0.62921060 -0.3522407942
## Fucks                  0.97424064  0.98857134  0.98841050  0.0085232049
## Linsear.Write          0.99452970  0.99399250  0.99270597 -0.0324271351
## LIW                    0.99372252  0.99738312  0.99722072 -0.0387963848
## nWS                    0.99547782  0.98719082  0.98164655 -0.0868361035
## nWS.2                  0.99701030  0.98998119  0.98430393 -0.0774388847
## nWS.3                  1.00000000  0.99698554  0.98905195 -0.0458903170
## nWS.4                  0.99698554  1.00000000  0.99380128 -0.0242978124
## RIX                    0.98905195  0.99380128  1.00000000 -0.0444703290
## Scrabble              -0.04589032 -0.02429781 -0.04447033  1.0000000000
## SMOG                   0.99778573  0.99076681  0.98160988 -0.0511186091
## SMOG.C                 0.99797185  0.99118746  0.98237501 -0.0501480088
## SMOG.simple            0.99778573  0.99076681  0.98160988 -0.0511186091
## SMOG.de                0.99778573  0.99076681  0.98160988 -0.0511186091
## Spache                 0.95786413  0.97726151  0.97599601  0.0354646177
## Spache.old             0.95786413  0.97726151  0.97599601  0.0354646177
## Strain                 0.97853156  0.99128781  0.99114270 -0.0005155977
## Traenkle.Bailer       -0.97928651 -0.99180632 -0.98944923 -0.0023260095
## Traenkle.Bailer.2     -0.83294548 -0.79351594 -0.78537286  0.2500591399
## Wheeler.Smith          0.98635632  0.99411156  0.99677384 -0.0411360767
## meanSentenceLength     0.95786413  0.97726151  0.97599601  0.0354646177
## meanWordSyllables      0.78047895  0.73186125  0.71973074 -0.2859592235
##                              SMOG      SMOG.C SMOG.simple     SMOG.de
## ARI                    0.97393941  0.97457741  0.97393941  0.97393941
## ARI.simple             0.97295999  0.97361454  0.97295999  0.97295999
## Bormuth               -0.94303306 -0.94405248 -0.94303306 -0.94303306
## Bormuth.GP             0.82113760  0.82339465  0.82113760  0.82113760
## Coleman               -0.71333148 -0.71131513 -0.71333148 -0.71333148
## Coleman.C2            -0.86811865 -0.86636103 -0.86811865 -0.86811865
## Coleman.Liau          -0.89524825 -0.89368168 -0.89524825 -0.89524825
## Coleman.Liau.grade     0.89524825  0.89368168  0.89524825  0.89524825
## Coleman.Liau.short     0.89526894  0.89370243  0.89526894  0.89526894
## Dale.Chall            -0.94089430 -0.94194286 -0.94089430 -0.94089430
## Dale.Chall.old         0.94089430  0.94194286  0.94089430  0.94089430
## Dale.Chall.PSK         0.94089430  0.94194286  0.94089430  0.94089430
## Danielson.Bryan        0.98460337  0.98509384  0.98460337  0.98460337
## Danielson.Bryan.2     -0.47640589 -0.47358728 -0.47640589 -0.47640589
## Dickes.Steiwer        -0.96367176 -0.96447120 -0.96367176 -0.96367176
## DRP                    0.94303306  0.94405248  0.94303306  0.94303306
## ELF                    0.97747350  0.97820253  0.97747350  0.97747350
## Farr.Jenkins.Paterson -0.94224804 -0.94328383 -0.94224804 -0.94224804
## Flesch                -0.99453689 -0.99442699 -0.99453689 -0.99453689
## Flesch.PSK             0.99489006  0.99503836  0.99489006  0.99489006
## Flesch.Kincaid         0.98304183  0.98356657  0.98304183  0.98304183
## FOG                    0.99004142  0.99047862  0.99004142  0.99004142
## FOG.PSK                0.96641067  0.96720922  0.96641067  0.96641067
## FOG.NRI                0.53675653  0.53604370  0.53675653  0.53675653
## FORCAST                0.71333148  0.71131513  0.71333148  0.71333148
## FORCAST.RGL            0.71333148  0.71131513  0.71333148  0.71333148
## Fucks                  0.96035603  0.96128185  0.96035603  0.96035603
## Linsear.Write          0.99042784  0.99111709  0.99042784  0.99042784
## LIW                    0.98677649  0.98723721  0.98677649  0.98677649
## nWS                    0.99682318  0.99677506  0.99682318  0.99682318
## nWS.2                  0.99752895  0.99753895  0.99752895  0.99752895
## nWS.3                  0.99778573  0.99797185  0.99778573  0.99778573
## nWS.4                  0.99076681  0.99118746  0.99076681  0.99076681
## RIX                    0.98160988  0.98237501  0.98160988  0.98160988
## Scrabble              -0.05111861 -0.05014801 -0.05111861 -0.05111861
## SMOG                   1.00000000  0.99998563  1.00000000  1.00000000
## SMOG.C                 0.99998563  1.00000000  0.99998563  0.99998563
## SMOG.simple            1.00000000  0.99998563  1.00000000  1.00000000
## SMOG.de                1.00000000  0.99998563  1.00000000  1.00000000
## Spache                 0.94089430  0.94194286  0.94089430  0.94089430
## Spache.old             0.94089430  0.94194286  0.94089430  0.94089430
## Strain                 0.96573029  0.96664268  0.96573029  0.96573029
## Traenkle.Bailer       -0.96666632 -0.96741430 -0.96666632 -0.96666632
## Traenkle.Bailer.2     -0.85456811 -0.85299245 -0.85456811 -0.85456811
## Wheeler.Smith          0.97747350  0.97820253  0.97747350  0.97747350
## meanSentenceLength     0.94089430  0.94194286  0.94089430  0.94089430
## meanWordSyllables      0.80819548  0.80640351  0.80819548  0.80819548
##                            Spache  Spache.old        Strain
## ARI                    0.99099423  0.99099423  0.9976064239
## ARI.simple             0.99169343  0.99169343  0.9978635733
## Bormuth               -0.99996713 -0.99996713 -0.9959605646
## Bormuth.GP             0.92746308  0.92746308  0.9201367937
## Coleman               -0.47287100 -0.47287100 -0.5461292792
## Coleman.C2            -0.68300831 -0.68300831 -0.7394040240
## Coleman.Liau          -0.72238083 -0.72238083 -0.7744192348
## Coleman.Liau.grade     0.72238083  0.72238083  0.7744192348
## Coleman.Liau.short     0.72241463  0.72241463  0.7744490784
## Dale.Chall            -1.00000000 -1.00000000 -0.9954423736
## Dale.Chall.old         1.00000000  1.00000000  0.9954423736
## Dale.Chall.PSK         1.00000000  1.00000000  0.9954423736
## Danielson.Bryan        0.97829531  0.97829531  0.9916468644
## Danielson.Bryan.2     -0.18028533 -0.18028533 -0.2608961180
## Dickes.Steiwer        -0.99638999 -0.99638999 -0.9987560895
## DRP                    0.99996713  0.99996713  0.9959605646
## ELF                    0.98451149  0.98451149  0.9957646921
## Farr.Jenkins.Paterson -0.99998999 -0.99998999 -0.9958154056
## Flesch                -0.92913685 -0.92913685 -0.9579641510
## Flesch.PSK             0.95657238  0.95657238  0.9782782859
## Flesch.Kincaid         0.98488025  0.98488025  0.9958824210
## FOG                    0.97841468  0.97841468  0.9919662659
## FOG.PSK                0.99626213  0.99626213  0.9992501846
## FOG.NRI                0.45492516  0.45492516  0.4766100001
## FORCAST                0.47287100  0.47287100  0.5461292792
## FORCAST.RGL            0.47287100  0.47287100  0.5461292792
## Fucks                  0.99711281  0.99711281  0.9992788331
## Linsear.Write          0.96173384  0.96173384  0.9825055506
## LIW                    0.97649153  0.97649153  0.9907366605
## nWS                    0.93396357  0.93396357  0.9611722939
## nWS.2                  0.94010518  0.94010518  0.9657712464
## nWS.3                  0.95786413  0.95786413  0.9785315584
## nWS.4                  0.97726151  0.97726151  0.9912878102
## RIX                    0.97599601  0.97599601  0.9911426991
## Scrabble               0.03546462  0.03546462 -0.0005155977
## SMOG                   0.94089430  0.94089430  0.9657302854
## SMOG.C                 0.94194286  0.94194286  0.9666426822
## SMOG.simple            0.94089430  0.94089430  0.9657302854
## SMOG.de                0.94089430  0.94089430  0.9657302854
## Spache                 1.00000000  1.00000000  0.9954423736
## Spache.old             1.00000000  1.00000000  0.9954423736
## Strain                 0.99544237  0.99544237  1.0000000000
## Traenkle.Bailer       -0.99529866 -0.99529866 -0.9987681336
## Traenkle.Bailer.2     -0.66117918 -0.66117918 -0.7203881294
## Wheeler.Smith          0.98451149  0.98451149  0.9957646921
## meanSentenceLength     1.00000000  1.00000000  0.9954423736
## meanWordSyllables      0.57632421  0.57632421  0.6467738310
##                       Traenkle.Bailer Traenkle.Bailer.2 Wheeler.Smith
## ARI                      -0.999223850        -0.7549178    0.99594813
## ARI.simple               -0.999411499        -0.7514487    0.99583585
## Bormuth                   0.995931694         0.6663152   -0.98541527
## Bormuth.GP               -0.910151256        -0.5147186    0.89837506
## Coleman                   0.549146837         0.9206771   -0.61270055
## Coleman.C2                0.745171965         0.9574552   -0.79038825
## Coleman.Liau              0.785328440         0.9904793   -0.81342458
## Coleman.Liau.grade       -0.785328440        -0.9904793    0.81342458
## Coleman.Liau.short       -0.785358433        -0.9904734    0.81345135
## Dale.Chall                0.995298661         0.6611792   -0.98451149
## Dale.Chall.old           -0.995298661        -0.6611792    0.98451149
## Dale.Chall.PSK           -0.995298661        -0.6611792    0.98451149
## Danielson.Bryan          -0.993491902        -0.7999586    0.99535015
## Danielson.Bryan.2         0.274099704         0.8509255   -0.32598561
## Dickes.Steiwer            0.999594489         0.7195290   -0.99345440
## DRP                      -0.995931694        -0.6663152    0.98541527
## ELF                      -0.994372531        -0.7636349    1.00000000
## Farr.Jenkins.Paterson     0.995687365         0.6642605   -0.98524897
## Flesch                    0.959086871         0.8800874   -0.97375531
## Flesch.PSK               -0.979131224        -0.8419885    0.98827872
## Flesch.Kincaid           -0.996329486        -0.7757025    0.99727425
## FOG                      -0.992467872        -0.7905347    0.99443432
## FOG.PSK                  -0.999375460        -0.7187449    0.99386237
## FOG.NRI                  -0.483157912        -0.5380406    0.49527062
## FORCAST                  -0.549146837        -0.9206771    0.61270055
## FORCAST.RGL              -0.549146837        -0.9206771    0.61270055
## Fucks                    -0.999275945        -0.7123424    0.99362322
## Linsear.Write            -0.978983682        -0.8037317    0.98844709
## LIW                      -0.991902133        -0.8003943    0.99591724
## nWS                      -0.962858718        -0.8744090    0.97678636
## nWS.2                    -0.967434256        -0.8665151    0.97966902
## nWS.3                    -0.979286510        -0.8329455    0.98635632
## nWS.4                    -0.991806322        -0.7935159    0.99411156
## RIX                      -0.989449234        -0.7853729    0.99677384
## Scrabble                 -0.002326009         0.2500591   -0.04113608
## SMOG                     -0.966666322        -0.8545681    0.97747350
## SMOG.C                   -0.967414302        -0.8529924    0.97820253
## SMOG.simple              -0.966666322        -0.8545681    0.97747350
## SMOG.de                  -0.966666322        -0.8545681    0.97747350
## Spache                   -0.995298661        -0.6611792    0.98451149
## Spache.old               -0.995298661        -0.6611792    0.98451149
## Strain                   -0.998768134        -0.7203881    0.99576469
## Traenkle.Bailer           1.000000000         0.7306470   -0.99437253
## Traenkle.Bailer.2         0.730647041         1.0000000   -0.76363493
## Wheeler.Smith            -0.994372531        -0.7636349    1.00000000
## meanSentenceLength       -0.995298661        -0.6611792    0.98451149
## meanWordSyllables        -0.649467672        -0.9684616    0.69782528
##                       meanSentenceLength meanWordSyllables
## ARI                           0.99099423         0.6762172
## ARI.simple                    0.99169343         0.6724762
## Bormuth                      -0.99996713        -0.5818750
## Bormuth.GP                    0.92746308         0.4479544
## Coleman                      -0.47287100        -0.9645525
## Coleman.C2                   -0.68300831        -0.9629188
## Coleman.Liau                 -0.72238083        -0.9543023
## Coleman.Liau.grade            0.72238083         0.9543023
## Coleman.Liau.short            0.72241463         0.9542911
## Dale.Chall                   -1.00000000        -0.5763242
## Dale.Chall.old                1.00000000         0.5763242
## Dale.Chall.PSK                1.00000000         0.5763242
## Danielson.Bryan               0.97829531         0.7259320
## Danielson.Bryan.2            -0.18028533        -0.8772134
## Dickes.Steiwer               -0.99638999        -0.6393556
## DRP                           0.99996713         0.5818750
## ELF                           0.98451149         0.6978253
## Farr.Jenkins.Paterson        -0.99998999        -0.5798330
## Flesch                       -0.92913685        -0.8376402
## Flesch.PSK                    0.95657238         0.7895116
## Flesch.Kincaid                0.98488025         0.7091829
## FOG                           0.97841468         0.7282418
## FOG.PSK                       0.99626213         0.6428726
## FOG.NRI                       0.45492516         0.5024015
## FORCAST                       0.47287100         0.9645525
## FORCAST.RGL                   0.47287100         0.9645525
## Fucks                         0.99711281         0.6310456
## Linsear.Write                 0.96173384         0.7546688
## LIW                           0.97649153         0.7322100
## nWS                           0.93396357         0.8262434
## nWS.2                         0.94010518         0.8158488
## nWS.3                         0.95786413         0.7804790
## nWS.4                         0.97726151         0.7318613
## RIX                           0.97599601         0.7197307
## Scrabble                      0.03546462        -0.2859592
## SMOG                          0.94089430         0.8081955
## SMOG.C                        0.94194286         0.8064035
## SMOG.simple                   0.94089430         0.8081955
## SMOG.de                       0.94089430         0.8081955
## Spache                        1.00000000         0.5763242
## Spache.old                    1.00000000         0.5763242
## Strain                        0.99544237         0.6467738
## Traenkle.Bailer              -0.99529866        -0.6494677
## Traenkle.Bailer.2            -0.66117918        -0.9684616
## Wheeler.Smith                 0.98451149         0.6978253
## meanSentenceLength            1.00000000         0.5763242
## meanWordSyllables             0.57632421         1.0000000
# Box plots of the min-max normalised Flesch-Kincaid score per section,
# timed together with tic()/toc().
tic()

# All sections.
boxplot(normalit(Flesch.Kincaid) ~ section,
        data = tr.df,
        main = "Reading level by Section ")

# First four sections only. The trailing comma in `tr.df[rows, ]` matters:
# without it a data.frame is indexed by column, so the logical row mask
# would select (or fail to find) columns instead of filtering rows.
boxplot(normalit(Flesch.Kincaid) ~ section,
        data = tr.df[tr.df$section %in% c("Section 01", "Section 02",
                                          "Section 03", "Section 04"), ],
        main = "Reading level by Section ")

toc()
## 0.09 sec elapsed
# Score each entry of raw.df$raw_text with syuzhet's NRC sentiment function,
# then attach the section label of every entry so the scores can be grouped
# by section later on. The whole step is timed with tic()/toc().
tic()

Sentiment <- get_nrc_sentiment(raw.df$raw_text)

sentiment.df <- data.frame(
  section_id = doc_id_text,
  Sentiment,
  stringsAsFactors = FALSE
)

toc()
## 7.03 sec elapsed
tic()

# Total the ten NRC category columns of `scores` and return a two-column
# data.frame ready for plotting.
#
# scores: data.frame holding the NRC columns (anger, anticipation, disgust,
#         fear, joy, sadness, surprise, trust, negative, positive), one row
#         per scored text. Extra columns (e.g. section_id) are ignored.
# Returns: data.frame with `sentiment` (factor, display order Negative ...
#          Positive) and `count` (column total over ALL rows of `scores`).
#
# Labels are attached by matching on the category name, not by position, so
# every bar is guaranteed to carry the name of the category it counts.
sentiment_totals <- function(scores) {
  nrc_levels <- c("negative", "anger", "anticipation", "disgust", "fear",
                  "joy", "sadness", "surprise", "trust", "positive")
  nrc_labels <- c("Negative", "Anger", "Anticipation", "Disgust", "Fear",
                  "Joy", "Sadness", "Surprise", "Trust", "Positive")

  # colSums() covers every row, including the first one, and also works when
  # `scores` has a single row.
  totals <- colSums(scores[, nrc_levels, drop = FALSE])

  data.frame(
    sentiment = factor(names(totals), levels = nrc_levels, labels = nrc_labels),
    count = as.numeric(totals),
    stringsAsFactors = FALSE
  )
}

# Overall sentiment across the whole document.
td_Plot <- sentiment_totals(Sentiment)

ggplot(td_Plot, aes(x = sentiment, weight = count, fill = sentiment)) +
  geom_bar() +
  ggtitle("Doug's book Collection Overall sentiment analysis")

# One chart per section. ggplot objects must be print()ed inside a loop.
for (doc_id in sort(unique(sentiment.df$section_id))) {
  # %in% never yields NA, so rows with a missing section_id are not picked up.
  tmp.df <- sentiment.df[sentiment.df$section_id %in% doc_id, ]
  td_Plot <- sentiment_totals(tmp.df)

  print(
    ggplot(td_Plot, aes(x = sentiment, weight = count, fill = sentiment)) +
      geom_bar() +
      ggtitle(paste0("Doug's book Collection Overall sentiment analysis for ",
                     doc_id))
  )
}

toc()
## 2.66 sec elapsed
# Load the English udpipe model from the local model file.
udmodel_english <- udpipe_load_model(
  file = "../udpipe/english-ud-2.0-170801.udpipe"
)

# Annotate every element of `text`; each element's document id is the file
# name followed by its entry in doc_id_text. The annotation is then converted
# to a data.frame. This is the slow step, so it is timed.
tic()

ann.raw <- udpipe_annotate(
  udmodel_english,
  text,
  doc_id = paste(filename, doc_id_text)  # paste() separates with " " by default
)
ann.df <- data.frame(ann.raw)

toc()
## 484.06 sec elapsed
# Small demonstration: annotate three sentences that all contain the word
# "average" and show how the tagger labels it in each one.
demo.raw <- udpipe_annotate(
  udmodel_english,
  c("What is the average temperature?",
    "Can you average those numbers?",
    "What does the average say? ")
)
demo.df <- data.frame(demo.raw)

#
# Same word, different part of speech.
#
# Columns are selected by name (previously positions 4, 6 and 8) so the
# output stays correct if the column layout of the annotation ever changes.
demo.df[demo.df$token == "average", c("sentence", "token", "upos")]

# First two annotated tokens, all columns.
demo.df[1:2, ]
# Sample of universal POS tags, converted to the one-letter codes used by
# the phrase pattern matcher (see the printed result below).
x <- c(
  "PROPN", "SCONJ", "ADJ", "NOUN", "VERB", "INTJ",
  "DET", "VERB", "PROPN", "AUX", "NUM", "NUM",
  "X", "SCONJ", "PRON", "PUNCT", "ADP", "X",
  "PUNCT", "AUX", "PROPN", "ADP", "X", "PROPN",
  "ADP", "DET", "CCONJ", "INTJ", "NOUN", "PROPN"
)
# "upos" is the default tag set; it is spelled out here for clarity.
as_phrasemachine(x, type = "upos")
##  [1] "N" "C" "A" "N" "V" "O" "D" "V" "N" "V" "A" "A" "O" "C" "N" "O" "P"
## [18] "O" "O" "V" "N" "P" "O" "N" "P" "D" "C" "O" "N" "N"
tic()

# Noun-phrase keywords over the whole annotated document.
tmp.df <- ann.df

# Recode the universal POS tags into single letters so phrases can be
# matched with a regular expression.
tmp.df$phrase_tag <- as_phrasemachine(tmp.df$upos, type = "upos")

# Pattern: optional adjectives/nouns, a noun, then optionally repeated
# "preposition(s) + determiners + adjectives/nouns + noun" groups.
phrases <- keywords_phrases(
  x = tmp.df$phrase_tag,
  term = tmp.df$token,
  pattern = "(A|N)*N(P+D*(A|N)*N)*",
  is_regex = TRUE,
  detailed = FALSE
)

# Keep multi-word phrases that occur more than three times.
phrases <- subset(phrases, ngram > 1 & freq > 3)

# Reversed levels put the first keyword at the top of the chart.
phrases$key <- factor(phrases$keyword, levels = rev(phrases$keyword))

# Plot the 20 most frequent phrases.
print(
  barchart(
    key ~ freq,
    data = head(phrases[order(phrases$freq, decreasing = TRUE), ], 20),
    col = "green",
    main = "Keywords - simple noun phrases ",
    xlab = "Frequency keywords"
  )
)

# Repeat the noun-phrase extraction separately for every annotated document
# id, drawing one chart per document, then stop the timer.
for (doc in unique(ann.df$doc_id)) {
  # Tokens belonging to the current document only.
  tmp.df <- ann.df[ann.df$doc_id == doc, ]
  tmp.df$phrase_tag <- as_phrasemachine(tmp.df$upos, type = "upos")

  phrases <- keywords_phrases(
    x = tmp.df$phrase_tag,
    term = tmp.df$token,
    pattern = "(A|N)*N(P+D*(A|N)*N)*",
    is_regex = TRUE,
    detailed = FALSE
  )

  # Multi-word phrases seen more than three times in this document.
  phrases <- subset(phrases, ngram > 1 & freq > 3)
  phrases$key <- factor(phrases$keyword, levels = rev(phrases$keyword))

  # lattice plots must be print()ed explicitly inside a loop.
  print(
    barchart(
      key ~ freq,
      data = head(phrases[order(phrases$freq, decreasing = TRUE), ], 20),
      col = "Green",
      main = paste0("Keywords - simple noun phrases ", doc),
      xlab = "Frequency"
    )
  )
}

toc()
## 47.68 sec elapsed
# RAKE keywords over the whole annotated document, built from lemmas and
# restricted to nouns and adjectives.
tmp.df <- ann.df

rake_keywords <- keywords_rake(
  tmp.df,
  term = "lemma",
  group = "doc_id",
  relevant = tmp.df$upos %in% c("NOUN", "ADJ")
)

# The factor is created before filtering, so its levels follow the order in
# which keywords_rake() returned the keywords (reversed for the chart).
rake_keywords$key <- factor(rake_keywords$keyword,
                            levels = rev(rake_keywords$keyword))

# Keep multi-word keywords that occur more than three times.
rake_keywords <- subset(rake_keywords, ngram > 1 & freq > 3)

# Chart the RAKE score of the 20 most frequent keywords.
print(
  barchart(
    key ~ rake,
    data = head(
      rake_keywords[order(rake_keywords$freq, decreasing = TRUE), ],
      20
    ),
    col = "blue",
    main = "Keywords identified by RAKE ",
    xlab = "Rake"
  )
)

# Repeat the RAKE keyword extraction for every annotated document id and
# draw one chart per document.
for (doc in unique(ann.df$doc_id)) {
  # Tokens belonging to the current document only.
  tmp.df <- ann.df[ann.df$doc_id == doc, ]

  rake_keywords <- keywords_rake(
    tmp.df,
    term = "lemma",
    group = "doc_id",
    relevant = tmp.df$upos %in% c("NOUN", "ADJ")
  )

  # Levels are fixed before filtering, in the order keywords_rake() returned.
  rake_keywords$key <- factor(rake_keywords$keyword,
                              levels = rev(rake_keywords$keyword))
  rake_keywords <- subset(rake_keywords, ngram > 1 & freq > 3)

  # lattice plots must be print()ed explicitly inside a loop.
  print(
    barchart(
      key ~ rake,
      data = head(
        rake_keywords[order(rake_keywords$freq, decreasing = TRUE), ],
        20
      ),
      col = "blue",
      main = paste0("Keywords identified by RAKE ", doc),
      xlab = "Rake Keyword"
    )
  )
}