Import the data from the Dutch Lexicon Project DLP_words.csv. All materials are from here: http://crr.ugent.be/programs-data/lexicon-projects
Variables we are going to use: - rt: Response Latency to the Lexical Decision Task - subtlex.frequency: The frequency of the word from the Dutch Subtitle Project. - length: Length of the word. - POS: part of speech. - bigram.freq: Summed frequency of the bigrams in the word (the sum of each two-letter combination frequency).
library(readr)
DLP_words <- read_csv("/Users/katie/R Data/ANLY 540 Human/DLP_words.csv")
## Parsed with column specification:
## cols(
## spelling = col_character(),
## lexicality = col_character(),
## rt = col_double(),
## subtlex.frequency = col_double(),
## length = col_double(),
## POS = col_character(),
## bigram.freq = col_double()
## )
Load all the libraries or functions that you will use to for the rest of the assignment. It is helpful to define your libraries and functions at the top of a report, so that others can know what they need for the report to compile correctly.
library(Rling)
library(modeest)
#install.packages("textdata")
library(textdata)
#install.packages("tidytext")
library(knitr)
library(boot)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:boot':
##
## logit
Update the part of speech variable so that the Nouns are the comparison category. Here’s what the labels mean:
ADJ - Adjective N - Noun WW - Verbs
table(DLP_words$POS)
##
## ADJ N WW
## 1397 7591 3038
DLP_words$POS <- factor(DLP_words$POS, levels = c("N", "ADJ", "WW"), labels = c("Noun", "Adjective", "WW"))
Since we are using frequencies, we should consider the non-normality of frequency. - Include a histogram of the original subtlex.frequency column. - Log-transform the subtlex.frequency column. - Include a histogram of bigram.freq - note that it does not appear extremely skewed.
hist(DLP_words$subtlex.frequency, breaks = 100)
DLP_words$sublog <- log(DLP_words$subtlex.frequency)
hist(DLP_words$sublog)
See if you can predict response latencies (DV) with the following IVs: subtitle frequency, length, POS, and bigram frequency.
Model1 = lm(rt ~ length + sublog + POS, data = DLP_words)
summary(Model1)
##
## Call:
## lm(formula = rt ~ length + sublog + POS, data = DLP_words)
##
## Residuals:
## Min 1Q Median 3Q Max
## -350.05 -33.88 -5.93 27.12 613.42
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 711.4741 2.5243 281.852 < 2e-16 ***
## length -0.8556 0.3163 -2.705 0.00684 **
## sublog -18.0412 0.2257 -79.950 < 2e-16 ***
## POSAdjective 7.8240 1.4883 5.257 1.49e-07 ***
## POSWW 13.9057 1.1068 12.564 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.84 on 12010 degrees of freedom
## (11 observations deleted due to missingness)
## Multiple R-squared: 0.3678, Adjusted R-squared: 0.3676
## F-statistic: 1747 on 4 and 12010 DF, p-value: < 2.2e-16
summary(Model1$residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -350.052 -33.884 -5.926 0.000 27.116 613.416
All the coefficients are significant since they p < 0.05
Length is negative predictor: More the length of words, less is the response latency Frequency is negative predictor: less frequent the occurance of words, more reponse latency Adjective and noun have same response latency WW and Noun have same response latency
t1= summary(Model1)$coefficients[-1 , 3]
t1
## length sublog POSAdjective POSWW
## -2.704915 -79.950031 5.257095 12.563514
pr = t1/ sqrt(t1^2 + Model1$df.residual)
pr^2
## length sublog POSAdjective POSWW
## 0.0006088353 0.3473538103 0.0022958869 0.0129720523
tapply(DLP_words$rt, DLP_words$POS,mean)
## Noun Adjective WW
## NA NA 633.5708
summary(Model1)$fstatistic
## value numdf dendf
## 1746.652 4.000 12010.000
summary(Model1)$r.squared
## [1] 0.3677818
Is the overall model statistically significant? Model is some what statistically signicant. its not highly significant
What is the practical importance of the overall model?
Create an influence plot of the model using the car library. - Which data points appear to have the most influence on the model? shown in the table below: 358, 4184, 7620, 7799, 8552 have the most influence on model as outliers.
influencePlot(Model1)
## StudRes Hat CookD
## 358 7.3698893 0.0004348151 4.704589e-03
## 4184 1.6412360 0.0019784470 1.067813e-03
## 7620 0.2023218 0.0022553732 1.850755e-05
## 7799 -6.9010422 0.0008339810 7.919452e-03
## 8552 12.1396066 0.0002738927 7.977702e-03
Do we have additivity in our model? Show that the correlations between predictors is less than .9. Show the VIF values.
summary(Model1, correlation = T)$correlation[ , -1]
## length sublog POSAdjective POSWW
## (Intercept) -0.910653048 -0.61616282 -0.062733426 0.001433948
## length 1.000000000 0.32051163 0.005673954 -0.081889081
## sublog 0.320511627 1.00000000 -0.090013432 -0.161531364
## POSAdjective 0.005673954 -0.09001343 1.000000000 0.220361456
## POSWW -0.081889081 -0.16153136 0.220361456 1.000000000
vif(Model1)
## GVIF Df GVIF^(1/(2*Df))
## length 1.117845 1 1.057282
## sublog 1.143072 1 1.069145
## POS 1.033186 2 1.008195
Is the model linear? Yes, model is linear. - Include a plot and interpret the output.
plot(Model1, which = 2)
Are the errors normally distributed? Yes - Include a plot and interpret the output.
hist(scale(residuals(Model1)))
Do the errors meet the assumptions of homoscedasticity and homogeneity? yes. heteroscedasticity exists in this model. - Include a plot and interpret the output (either plot option).
plot(Model1, which = 1)
{plot(scale(residuals(Model1)), scale(Model1$fitted.values))
abline(v = 0, h = 0)}
Use the function provided from class (included below) and the boot library to bootstrap the model you created 1000 times. - Include the estimates of the coefficients from the bootstrapping. - Include the confidence intervals for at least one of the predictors (not the intercept). - Do our estimates appear stable, given the bootstrapping results?
Use the following to randomly sample 500 rows of data - generally, you have to have more bootstraps than rows of data, so this code will speed up your assignment. In the boot function use: data = DF[sample(1:nrow(DF), 500, replace=FALSE),] for the data argument changing DF to the name of your data frame.
bootcoef = function(formula, data, indices){
d = data[indices, ] #randomize the data by row
model = lm(formula, data = d) #run our model
return(coef(model)) #give back coefficients
}
Model1.boot = boot(formula = rt ~ length + sublog + POS,
data = DLP_words[sample(1:nrow(DLP_words), 500, replace=FALSE),],
statistic = bootcoef,
R = 1000)
Model1.boot
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = DLP_words[sample(1:nrow(DLP_words), 500, replace = FALSE),
## ], statistic = bootcoef, R = 1000, formula = rt ~ length +
## sublog + POS)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 695.7750073 0.58442670 13.239183
## t2* 0.7596661 -0.08911837 1.710815
## t3* -17.7322822 -0.01293772 0.944216
## t4* 11.7314731 -0.25821547 7.909704
## t5* 16.0552884 0.18058836 4.787369
boot.ci(Model1.boot, index = 3)
## Warning in boot.ci(Model1.boot, index = 3): bootstrap variances needed for
## studentized intervals
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = Model1.boot, index = 3)
##
## Intervals :
## Level Normal Basic
## 95% (-19.57, -15.87 ) (-19.59, -15.76 )
##
## Level Percentile BCa
## 95% (-19.70, -15.88 ) (-19.71, -15.88 )
## Calculations and Intervals on Original Scale