## Loading required package: splines

## Loading required package: RcmdrMisc

## Loading required package: car

## Loading required package: carData

## Loading required package: sandwich

## Loading required package: effects

## lattice theme set by effectsTheme()
## See ?effectsTheme for details.

## The Commander GUI is launched only in interactive sessions

## 
## Attaching package: 'Rcmdr'

## The following object is masked from 'package:base':
## 
##     errorCondition

## corrplot 0.92 loaded

1 Structure of Data Analysis

DiagrammeR::grViz("digraph {
  graph [layout = dot, rankdir = TB]
  
  node [shape = rectangle]        
  rec1 [label = '1. Introduction to the Dataset']
  rec2 [label = '2. Exploratory Analysis']
  rec3 [label = '3. Predictive Analysis']
  rec4 [label = '4. Recommendations/Conclusions']
  
  # edge definitions with the node IDs
  rec1 -> rec2 -> rec3 -> rec4
  }",
  height = 500)

2 Sample dataset

setwd("C:/ssb")
review <- read.table("data/reviewData.csv",
   header=TRUE, stringsAsFactors=TRUE, sep=",", na.strings="NA", dec=".", 
  strip.white=TRUE)

dat <- head(review)
kab <- knitr::kable(dat, caption = "Sample Data",
                         booktabs = T, label = "")
kable_classic_2(kab, full_width = F)

(#tab:)Sample Data
emotion	perfunct	financial	water	Authentic	function.	affect	cogproc	percept
21	9	72	113	14.43	51.22	7.32	14.63	7.32
21	7	71	96	51.03	51.92	9.62	9.62	7.69
30	9	64	101	9.72	50.94	15.09	5.66	5.66
23	10	66	89	1.00	46.51	16.28	4.65	4.65
22	17	66	80	1.00	59.57	14.89	2.13	10.64
16	105	23	31	58.07	33.33	11.11	11.11	0.00

3 Summary of the dataset

numSummary(review[,c("affect", "Authentic", "cogproc", "emotion", "financial", "function.", "percept", "perfunct", "water"), drop=FALSE], statistics=c("mean", "sd", "IQR", "quantiles"), quantiles=c(0,.25,.5,.75,1))

##                mean        sd      IQR 0%     25%    50%      75% 100%   n
## affect    16.764621 17.424472  12.9400  0  6.2500 12.355  19.1900  100 264
## Authentic 29.856970 33.503624  52.0450  1  1.0475 14.430  53.0925   99 264
## cogproc    9.664280  9.362441   9.9575  0  3.4500  8.135  13.4075   50 264
## emotion   50.375000 58.025885  91.5000  3 10.0000 20.000 101.5000  175 264
## financial 58.041667 44.723083  58.0000  1 11.0000 54.000  69.0000  164 264
## function. 46.773674 15.663334  14.4150  0 41.6300 50.000  56.0450   80 264
## percept    4.749811  6.553425   6.5225  0  0.0000  2.900   6.5225   50 264
## perfunct  51.787879 59.996011 114.0000  2  9.0000 15.000 123.0000  176 264
## water     57.053030 34.752643  45.2500 16 31.0000 45.000  76.2500  134 264

C <- cor(review[,c("affect","Authentic","cogproc","emotion","financial","function.","percept","perfunct","water")], use="complete")
emphasize.strong.cells(which(C > 0.4 & C != 1 | C < -0.4, arr.ind = TRUE))
pander(C)

Table continues below
	affect	Authentic	cogproc	emotion	financial
affect	1	-0.2387	-0.03391	-0.3	0.0116
Authentic	-0.2387	1	0.2718	0.215	-0.2006
cogproc	-0.03391	0.2718	1	0.08444	-0.1283
emotion	-0.3	0.215	0.08444	1	-0.7195
financial	0.0116	-0.2006	-0.1283	-0.7195	1
function.	-0.5653	0.2503	-0.02446	0.2287	-0.04511
percept	0.02271	-0.08356	0.1752	-0.02178	0.1559
perfunct	0.4696	-0.1318	0.09031	-0.502	0.1069
water	-0.09351	0.1074	-0.06307	-0.07343	-0.2063

	function.	percept	perfunct	water
affect	-0.5653	0.02271	0.4696	-0.09351
Authentic	0.2503	-0.08356	-0.1318	0.1074
cogproc	-0.02446	0.1752	0.09031	-0.06307
emotion	0.2287	-0.02178	-0.502	-0.07343
financial	-0.04511	0.1559	0.1069	-0.2063
function.	1	-0.1175	-0.3493	0.1204
percept	-0.1175	1	-0.1403	-0.03554
perfunct	-0.3493	-0.1403	1	-0.3947
water	0.1204	-0.03554	-0.3947	1

corrplot(C, order="FPC", method = "ellipse")

4 Test of Normality

normalityTest(~emotion, test="shapiro.test", data=review)

## 
##  Shapiro-Wilk normality test
## 
## data:  emotion
## W = 0.70599, p-value < 2.2e-16

normalityTest(~perfunct, test="shapiro.test", data=review)

## 
##  Shapiro-Wilk normality test
## 
## data:  perfunct
## W = 0.72373, p-value < 2.2e-16

normalityTest(~financial, test="shapiro.test", data=review)

## 
##  Shapiro-Wilk normality test
## 
## data:  financial
## W = 0.89498, p-value = 1.409e-12

normalityTest(~water, test="shapiro.test", data=review)

## 
##  Shapiro-Wilk normality test
## 
## data:  water
## W = 0.84613, p-value = 1.669e-15

5 Transformation to Normality

summary(powerTransform(emotion ~ 1, data=review, family="bcPower"))

## bcPower Transformation to Normality 
##    Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## Y1   -0.2958       -0.33      -0.4196      -0.1719
## 
## Likelihood ratio test that transformation parameter is equal to 0
##  (log transformation)
##                            LRT df       pval
## LR test, lambda = (0) 21.86336  1 2.9277e-06
## 
## Likelihood ratio test that no transformation is needed
##                            LRT df       pval
## LR test, lambda = (1) 376.4426  1 < 2.22e-16

5.1 Linear regression models

fit <- lm(emotion ~ Authentic+function.+affect+cogproc+percept, review)

# Show the linear model
equatiomatic::extract_eq(fit)

\[ \operatorname{emotion} = \alpha + \beta_{1}(\operatorname{Authentic}) + \beta_{2}(\operatorname{function.}) + \beta_{3}(\operatorname{affect}) + \beta_{4}(\operatorname{cogproc}) + \beta_{5}(\operatorname{percept}) + \epsilon \]

# display the actual coefficients
equatiomatic::extract_eq(fit, use_coefs = TRUE)

\[ \operatorname{\widehat{emotion}} = 42.46 + 0.23(\operatorname{Authentic}) + 0.25(\operatorname{function.}) - 0.76(\operatorname{affect}) + 0.27(\operatorname{cogproc}) - 0.05(\operatorname{percept}) \]

fit <- lm(perfunct ~ Authentic+function.+affect+cogproc+percept, review)

# Show the linear model
equatiomatic::extract_eq(fit)

\[ \operatorname{perfunct} = \alpha + \beta_{1}(\operatorname{Authentic}) + \beta_{2}(\operatorname{function.}) + \beta_{3}(\operatorname{affect}) + \beta_{4}(\operatorname{cogproc}) + \beta_{5}(\operatorname{percept}) + \epsilon \]

# display the actual coefficients
equatiomatic::extract_eq(fit, use_coefs = TRUE)

\[ \operatorname{\widehat{perfunct}} = 55.71 - 0.11(\operatorname{Authentic}) - 0.51(\operatorname{function.}) + 1.34(\operatorname{affect}) + 0.97(\operatorname{cogproc}) - 1.8(\operatorname{percept}) \]

fit <- lm(financial ~ Authentic+function.+affect+cogproc+percept, review)

# Show the linear model
equatiomatic::extract_eq(fit)

\[ \operatorname{financial} = \alpha + \beta_{1}(\operatorname{Authentic}) + \beta_{2}(\operatorname{function.}) + \beta_{3}(\operatorname{affect}) + \beta_{4}(\operatorname{cogproc}) + \beta_{5}(\operatorname{percept}) + \epsilon \]

# display the actual coefficients
equatiomatic::extract_eq(fit, use_coefs = TRUE)

\[ \operatorname{\widehat{financial}} = 67.86 - 0.22(\operatorname{Authentic}) - 0.03(\operatorname{function.}) - 0.11(\operatorname{affect}) - 0.54(\operatorname{cogproc}) + 1.1(\operatorname{percept}) \]

fit <- lm(water ~ Authentic+function.+affect+cogproc+percept, review)

# Show the linear model
equatiomatic::extract_eq(fit)

\[ \operatorname{water} = \alpha + \beta_{1}(\operatorname{Authentic}) + \beta_{2}(\operatorname{function.}) + \beta_{3}(\operatorname{affect}) + \beta_{4}(\operatorname{cogproc}) + \beta_{5}(\operatorname{percept}) + \epsilon \]

# display the actual coefficients
equatiomatic::extract_eq(fit, use_coefs = TRUE)

\[ \operatorname{\widehat{water}} = 50.21 + 0.11(\operatorname{Authentic}) + 0.17(\operatorname{function.}) - 0.06(\operatorname{affect}) - 0.34(\operatorname{cogproc}) - 0.01(\operatorname{percept}) \]

6 Logistic regressions

bEmotion <- ifelse(review$emotion > median(review$emotion), 1, 0)
bPerfunct <- ifelse(review$perfunct > median(review$perfunct), 1, 0)
bFinancial <- ifelse(review$financial > median(review$financial), 1, 0)
bWater <- ifelse(review$water > median(review$water), 1, 0)
binData <- data.frame(bEmotion, bPerfunct, bFinancial, bWater)
review <- cbind(review, binData)

fit <- glm(bEmotion ~ Authentic+function.+affect+cogproc+percept, data=review, family = binomial)

# Show the linear model
equatiomatic::extract_eq(fit)

\[ \log\left[ \frac { P( \operatorname{bEmotion} = \operatorname{1} ) }{ 1 - P( \operatorname{bEmotion} = \operatorname{1} ) } \right] = \alpha + \beta_{1}(\operatorname{Authentic}) + \beta_{2}(\operatorname{function.}) + \beta_{3}(\operatorname{affect}) + \beta_{4}(\operatorname{cogproc}) + \beta_{5}(\operatorname{percept}) \]

# display the actual coefficients
equatiomatic::extract_eq(fit, use_coefs = TRUE)

\[ \log\left[ \frac { \widehat{P( \operatorname{bEmotion} = \operatorname{1} )} }{ 1 - \widehat{P( \operatorname{bEmotion} = \operatorname{1} )} } \right] = -0.25 + 0.01(\operatorname{Authentic}) + 0.01(\operatorname{function.}) - 0.04(\operatorname{affect}) - 0.02(\operatorname{cogproc}) + 0.03(\operatorname{percept}) \]

summary(fit)

## 
## Call:
## glm(formula = bEmotion ~ Authentic + function. + affect + cogproc + 
##     percept, family = binomial, data = review)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9143  -1.1152  -0.1798   1.0561   2.0233  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -0.25224    0.64030  -0.394  0.69362   
## Authentic    0.01174    0.00442   2.655  0.00792 **
## function.    0.01140    0.01069   1.066  0.28622   
## affect      -0.03877    0.01224  -3.166  0.00154 **
## cogproc     -0.02025    0.01616  -1.253  0.21008   
## percept      0.02902    0.02132   1.361  0.17356   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 365.97  on 263  degrees of freedom
## Residual deviance: 327.97  on 258  degrees of freedom
## AIC: 339.97
## 
## Number of Fisher Scoring iterations: 4

# create a model
fit <- glm(bPerfunct ~ Authentic+function.+affect+cogproc+percept, data=review, family = binomial)

# display the actual coefficients
equatiomatic::extract_eq(fit, use_coefs = TRUE)

\[ \log\left[ \frac { \widehat{P( \operatorname{bPerfunct} = \operatorname{1} )} }{ 1 - \widehat{P( \operatorname{bPerfunct} = \operatorname{1} )} } \right] = 0.27 - 0.01(\operatorname{Authentic}) - 0.02(\operatorname{function.}) + 0.05(\operatorname{affect}) + 0.02(\operatorname{cogproc}) - 0.02(\operatorname{percept}) \]

summary(fit)

## 
## Call:
## glm(formula = bPerfunct ~ Authentic + function. + affect + cogproc + 
##     percept, family = binomial, data = review)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1415  -0.9909  -0.6254   1.1161   2.0514  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.274561   0.649130   0.423 0.672319    
## Authentic   -0.012049   0.004556  -2.645 0.008173 ** 
## function.   -0.016014   0.010897  -1.469 0.141698    
## affect       0.045113   0.012897   3.498 0.000469 ***
## cogproc      0.020946   0.016556   1.265 0.205821    
## percept     -0.023769   0.022031  -1.079 0.280647    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 365.74  on 263  degrees of freedom
## Residual deviance: 319.06  on 258  degrees of freedom
## AIC: 331.06
## 
## Number of Fisher Scoring iterations: 5

# create a model
fit <- glm(bFinancial ~ Authentic+function.+affect+cogproc+percept, data=review, family = binomial)

# display the actual coefficients
equatiomatic::extract_eq(fit, use_coefs = TRUE)

\[ \log\left[ \frac { \widehat{P( \operatorname{bFinancial} = \operatorname{1} )} }{ 1 - \widehat{P( \operatorname{bFinancial} = \operatorname{1} )} } \right] = 0.7 - 0.01(\operatorname{Authentic}) + 0(\operatorname{function.}) - 0.01(\operatorname{affect}) - 0.02(\operatorname{cogproc}) + 0.02(\operatorname{percept}) \]

summary(fit)

## 
## Call:
## glm(formula = bFinancial ~ Authentic + function. + affect + cogproc + 
##     percept, family = binomial, data = review)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5394  -1.1407  -0.7752   1.1100   2.0611  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  0.699052   0.618185   1.131   0.2581  
## Authentic   -0.007556   0.004142  -1.824   0.0681 .
## function.   -0.003408   0.010062  -0.339   0.7348  
## affect      -0.013285   0.009102  -1.460   0.1444  
## cogproc     -0.022264   0.015062  -1.478   0.1394  
## percept      0.018453   0.020188   0.914   0.3607  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 365.92  on 263  degrees of freedom
## Residual deviance: 356.09  on 258  degrees of freedom
## AIC: 368.09
## 
## Number of Fisher Scoring iterations: 4

# create a model
fit <- glm(bWater ~ Authentic+function.+affect+cogproc+percept, data=review, family = binomial)

# display the actual coefficients
equatiomatic::extract_eq(fit, use_coefs = TRUE)

\[ \log\left[ \frac { \widehat{P( \operatorname{bWater} = \operatorname{1} )} }{ 1 - \widehat{P( \operatorname{bWater} = \operatorname{1} )} } \right] = -0.59 + 0.01(\operatorname{Authentic}) + 0(\operatorname{function.}) + 0(\operatorname{affect}) + 0.02(\operatorname{cogproc}) + 0.03(\operatorname{percept}) \]

summary(fit)

## 
## Call:
## glm(formula = bWater ~ Authentic + function. + affect + cogproc + 
##     percept, family = binomial, data = review)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6775  -1.1491  -0.9432   1.1738   1.4301  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -5.886e-01  6.100e-01  -0.965   0.3345  
## Authentic    6.799e-03  4.130e-03   1.646   0.0997 .
## function.    9.944e-05  1.000e-02   0.010   0.9921  
## affect       2.745e-03  8.918e-03   0.308   0.7583  
## cogproc      1.909e-02  1.469e-02   1.299   0.1940  
## percept      2.623e-02  2.129e-02   1.232   0.2181  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 365.92  on 263  degrees of freedom
## Residual deviance: 358.02  on 258  degrees of freedom
## AIC: 370.02
## 
## Number of Fisher Scoring iterations: 4

7 Appendix: Extracting all R codes used in this report

library(bookdown)
library(DiagrammeR)
library(equatiomatic)
library(kableExtra)
library(xtable)
library(Rcmdr)
library(pander)
library(corrplot)

DiagrammeR::grViz("digraph {
  graph [layout = dot, rankdir = TB]
  
  node [shape = rectangle]        
  rec1 [label = '1. Introduction to the Dataset']
  rec2 [label = '2. Exploratory Analysis']
  rec3 [label = '3. Predictive Analysis']
  rec4 [label = '4. Recommendations/Conclusions']
  
  # edge definitions with the node IDs
  rec1 -> rec2 -> rec3 -> rec4
  }",
  height = 500)
setwd("C:/ssb")
review <- read.table("data/reviewData.csv",
   header=TRUE, stringsAsFactors=TRUE, sep=",", na.strings="NA", dec=".", 
  strip.white=TRUE)

dat <- head(review)
kab <- knitr::kable(dat, caption = "Sample Data",
                         booktabs = T, label = "")
kable_classic_2(kab, full_width = F)
numSummary(review[,c("affect", "Authentic", "cogproc", "emotion", "financial", "function.", "percept", "perfunct", "water"), drop=FALSE], statistics=c("mean", "sd", "IQR", "quantiles"), quantiles=c(0,.25,.5,.75,1))
C <- cor(review[,c("affect","Authentic","cogproc","emotion","financial","function.","percept","perfunct","water")], use="complete")
emphasize.strong.cells(which(C > 0.4 & C != 1 | C < -0.4, arr.ind = TRUE))
pander(C)
corrplot(C, order="FPC", method = "ellipse")

normalityTest(~emotion, test="shapiro.test", data=review)
normalityTest(~perfunct, test="shapiro.test", data=review)
normalityTest(~financial, test="shapiro.test", data=review)
normalityTest(~water, test="shapiro.test", data=review)
summary(powerTransform(emotion ~ 1, data=review, family="bcPower"))

fit <- lm(emotion ~ Authentic+function.+affect+cogproc+percept, review)

# Show the linear model
equatiomatic::extract_eq(fit)

# display the actual coefficients
equatiomatic::extract_eq(fit, use_coefs = TRUE)

fit <- lm(perfunct ~ Authentic+function.+affect+cogproc+percept, review)

# Show the linear model
equatiomatic::extract_eq(fit)

# display the actual coefficients
equatiomatic::extract_eq(fit, use_coefs = TRUE)

fit <- lm(financial ~ Authentic+function.+affect+cogproc+percept, review)

# Show the linear model
equatiomatic::extract_eq(fit)

# display the actual coefficients
equatiomatic::extract_eq(fit, use_coefs = TRUE)

fit <- lm(water ~ Authentic+function.+affect+cogproc+percept, review)

# Show the linear model
equatiomatic::extract_eq(fit)

# display the actual coefficients
equatiomatic::extract_eq(fit, use_coefs = TRUE)

bEmotion <- ifelse(review$emotion > median(review$emotion), 1, 0)
bPerfunct <- ifelse(review$perfunct > median(review$perfunct), 1, 0)
bFinancial <- ifelse(review$financial > median(review$financial), 1, 0)
bWater <- ifelse(review$water > median(review$water), 1, 0)
binData <- data.frame(bEmotion, bPerfunct, bFinancial, bWater)
review <- cbind(review, binData)

fit <- glm(bEmotion ~ Authentic+function.+affect+cogproc+percept, data=review, family = binomial)

# Show the linear model
equatiomatic::extract_eq(fit)

# display the actual coefficients
equatiomatic::extract_eq(fit, use_coefs = TRUE)

summary(fit)

# create a model
fit <- glm(bPerfunct ~ Authentic+function.+affect+cogproc+percept, data=review, family = binomial)

# display the actual coefficients
equatiomatic::extract_eq(fit, use_coefs = TRUE)

summary(fit)

# create a model
fit <- glm(bFinancial ~ Authentic+function.+affect+cogproc+percept, data=review, family = binomial)

# display the actual coefficients
equatiomatic::extract_eq(fit, use_coefs = TRUE)

summary(fit)

# create a model
fit <- glm(bWater ~ Authentic+function.+affect+cogproc+percept, data=review, family = binomial)

# display the actual coefficients
equatiomatic::extract_eq(fit, use_coefs = TRUE)

summary(fit)

U of Universe, ssb@universe ↩︎

Statistical Models for Identifying Fake Reviews

S.S.B.¹

1 Structure of Data Analysis

2 Sample dataset

3 Summary of the dataset

4 Test of Normality

5 Transformation to Normality

5.1 Linear regression models

6 Logistic regressions

7 Appendix: Extracting all R codes used in this report

Statistical Models for Identifying Fake Reviews

S.S.B.1

1 Structure of Data Analysis

2 Sample dataset

3 Summary of the dataset

4 Test of Normality

5 Transformation to Normality

5.1 Linear regression models

6 Logistic regressions

7 Appendix: Extracting all R codes used in this report

S.S.B.¹