An online version of the Notebook is available at http://rpubs.com/jvervaart/assignment5.
library(rmarkdown)
library(knitr)
library(lsr)
library(stats)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
options(scipen=999) # restricting the use of the scientific notation
Loading and inspecting the data (PU6-exercise-data.csv).
memory <- read.csv('C:/Users/jesse/Documents/PM DSS/Statistics/PU6-exercise-data.csv', header = TRUE)
str(memory)
## 'data.frame': 100 obs. of 3 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : int 21 61 20 24 54 53 56 57 38 43 ...
## $ memory_score: num 71.1 67.7 65.3 81.2 80.7 ...
summary(memory)
## id age memory_score
## Min. : 1.00 Min. :18.00 Min. :52.36
## 1st Qu.: 25.75 1st Qu.:27.75 1st Qu.:66.47
## Median : 50.50 Median :38.50 Median :73.26
## Mean : 50.50 Mean :39.80 Mean :73.27
## 3rd Qu.: 75.25 3rd Qu.:53.00 3rd Qu.:81.21
## Max. :100.00 Max. :65.00 Max. :90.48
head(memory, n = 10)
First dividing the age groups:
age_spread <- quantile(memory$age, prob = c(.33, .66))
memory$age_grouped <- ifelse(memory$age <= age_spread['33%'], 'Young', ifelse(memory$age <= age_spread['66%'], 'Middle.aged', 'Old'))
memory$age_grouped <- factor(memory$age_grouped)
str(memory) # Was turned into three levels
## 'data.frame': 100 obs. of 4 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : int 21 61 20 24 54 53 56 57 38 43 ...
## $ memory_score: num 71.1 67.7 65.3 81.2 80.7 ...
## $ age_grouped : Factor w/ 3 levels "Middle.aged",..: 3 2 3 3 2 2 2 2 1 1 ...
Then, creating the ANOVA model:
memory_anova <- aov(data = memory, formula = memory_score ~ age_grouped)
Fetching residuals to create QQ-Plot:
memory_anova_residuals <- residuals(memory_anova)
Creating QQ-Plot:
qqnorm(memory_anova_residuals)
qqline(memory_anova_residuals)
After creating ANOVA model, we can look at the ANOVA test to look at the effect size:
summary(memory_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## age_grouped 2 328 163.76 1.697 0.189
## Residuals 97 9363 96.52
etaSquared(memory_anova)
## eta.sq eta.sq.part
## age_grouped 0.03379913 0.03379913
Visualising the ANOVA test results:
memory_age_split <- split(memory$memory_score, memory$age_grouped)
memory_means <- c(mean(memory_age_split$Young), mean(memory_age_split$Middle.aged), mean(memory_age_split$Old))
memory_labels <- c('Young', 'Middle-aged', 'Old')
barplot(memory_means,
ylim = c(0, 100),
main = 'Memory scores',
ylab = 'Memory score',
xlab = 'Age',
names.arg = memory_labels,
col = rainbow(6)) # I experimented with rainbow() values until I found something that was pleasing to the eye
Creating variables to use in APA reporting:
n.young <- length(memory_age_split$Young)
n.middle <- length(memory_age_split$Middle.aged)
n.old <- length(memory_age_split$Old)
memory_anova_sm <- summary(memory_anova)
memory.anova.effect <- etaSquared(memory_anova)
memory.anova.f <- memory_anova_sm[[1]][["Pr(>F)"]]
Reporting of findings
The data set contains 36 younger subjects (M = 70.8662209), 30 middle-aged subjects (M = 74.8336047) and 34 older subjects (M = 74.8336047). An ANOVA test was conducted to evaluate the hypothesis that memory scores between age groups are equal. The results of this ANOVA test suggested that memory scores between age groups differ significantly (N2 = 0.0337991, f = 0.0337991).
Loading and inspecting the data (Statistic_2017_PracticeUnit6_ANOVA_Survey Exercises.csv).
survey <- read.csv('C:/Users/jesse/Documents/PM DSS/Statistics/Statistic_2017_PracticeUnit6_ANOVA_Survey Exercises.csv', header = TRUE)
head(survey, n = 10)
Including:
Only include prerequisites:
source_family <- subset(survey, survey$source == "FAMILY")
source_family$source <- revalue(source_family$source, c('FAMILY' = 'Family')) # Changing way value is represented in data set, to remove '/' in values
source_health <- subset(survey, survey$source == "HEALTH/ILLNESS")
source_health$source <- revalue(source_health$source, c('HEALTH/ILLNESS' = 'Health')) # Changing way value is represented in data set, to remove '/' in values
source_money <- subset(survey, survey$source == "MONEY/FINANCES")
source_money$source <- revalue(source_money$source, c('MONEY/FINANCES' = 'Money')) # Changing way value is represented in data set, to remove '/' in values
rmarkdown::paged_table(droplevels(source_family))
rmarkdown::paged_table(droplevels(source_health))
rmarkdown::paged_table(droplevels(source_money))
survey_includes <- rbind(source_family, source_health, source_money)
survey_includes$source <- factor(survey_includes$source)
rmarkdown::paged_table(survey_includes)
Calculating means:
# Reverse-worded (as per the assignment description) so 6 - 4x pss* variable
survey_includes$pss4_r <- 6 - survey_includes$pss4
survey_includes$pss5_r <- 6 - survey_includes$pss5
survey_includes$pss7_r <- 6 - survey_includes$pss7
survey_includes$pss8_r <- 6 - survey_includes$pss8
survey_includes$mean_stress <- ((survey_includes$pss1 + survey_includes$pss2 + survey_includes$pss3 + survey_includes$pss4_r + survey_includes$pss5_r + survey_includes$pss6 + survey_includes$pss7_r + survey_includes$pss8_r + survey_includes$pss9 + survey_includes$pss10) / 10)
survey_no.na <- survey_includes[!is.na(survey_includes$mean_stress),]
Visualising data:
hist(x = survey_no.na$mean_stress,
breaks = 10,
xlab = 'Stress',
main = 'Perceived stress')
Conducting the ANOVA test:
survey_anova <- aov(data = survey_no.na, formula = mean_stress ~ source)
survey_anova_residuals <- residuals(survey_anova)
qqnorm(y = survey_anova_residuals)
qqline(y = survey_anova_residuals)
summary(survey_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## source 2 2.13 1.0656 3.105 0.0493 *
## Residuals 96 32.94 0.3432
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
etaSquared(survey_anova)
## eta.sq eta.sq.part
## source 0.0607634 0.0607634
Visualising results of ANOVA test:
survey_split <- (split(survey_no.na$mean_stress, survey_no.na$source))
survey_means <- c(mean(survey_split$Family), mean(survey_split$Health), mean(survey_split$Money))
survey_labels <- c('Family', 'Health/Illness', 'Money/Finances')
barplot(survey_means,
ylim = c(0, 4),
main = 'Sources of stress',
ylab = 'Level of stress',
xlab = 'Source',
names.arg = survey_labels,
col = rainbow(4))
Creating variables to use in APA reporting:
n.family <- length(survey_split$Family)
n.health <- length(survey_split$Health)
n.money <- length(survey_split$Money)
survey_anova_sm <- summary(survey_anova)
survey.anova.effect <- etaSquared(survey_anova)
survey.anova.f <- survey_anova_sm[[1]][["Pr(>F)"]]
Reporting of findings
The data set contains 26 (M = 2.5538462) subjects who identified family as a source of stress, 20 subjects who identified health and/or illness as a source of stress (M = 2.95), and 53 subjects who identified money or finances as a source of stress (M = 2.6056604). An ANOVA test was conducted to evaluate the hypothesis that stress levels do not differ between sources of stress. The results of this ANOVA test suggested that stress levels differ significantly (N2 = 0.0607634, 0.0607634, f = 0.0493401, NA).
Loading the dataset (PU7_exercise2_ads.csv) and exploring it:
ads <- read.csv('C:/Users/jesse/Documents/PM DSS/Statistics/PU7_exercise2_ads.csv', header = TRUE)
summary(ads)
## gender age animation tagline mean_att_overall
## female:51 Min. :16.00 no :77 no :73 Min. :1.467
## male :97 1st Qu.:21.00 yes:71 yes:75 1st Qu.:2.800
## Median :25.00 Median :3.200
## Mean :29.82 Mean :3.199
## 3rd Qu.:33.25 3rd Qu.:3.600
## Max. :62.00 Max. :5.000
str(ads)
## 'data.frame': 148 obs. of 5 variables:
## $ gender : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 2 ...
## $ age : int 20 21 24 45 27 20 20 59 39 19 ...
## $ animation : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ tagline : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ mean_att_overall: num 2 2.93 3.4 3.93 2.47 ...
head(ads, n = 10)
Testing the following hypotheses:
Main effects
h1: The effect of animation on attitude will be stronger if the ad containts a tagline (interaction).
Creating ANOVA model:
ads_anova <- aov(data = ads, formula = mean_att_overall ~ animation * tagline) # tagline is the interaction in the ANOVA model
ads_anova_residuals <- residuals(ads_anova)
qqnorm(ads_anova_residuals)
qqline(ads_anova_residuals)
Conducting ANOVA test:
summary(ads_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## animation 1 1.98 1.985 5.757 0.01771 *
## tagline 1 7.00 7.004 20.314 0.0000135 ***
## animation:tagline 1 2.91 2.912 8.445 0.00424 **
## Residuals 144 49.65 0.345
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
etaSquared(ads_anova)
## eta.sq eta.sq.part
## animation 0.03218073 0.03836371
## tagline 0.11379282 0.12362809
## animation:tagline 0.04730816 0.05539856
Because the tested interaction is yes or no, I used an interaction plot to visualise the data, instead of a barplot:
ads %>%
ggplot() +
aes(x = animation, color = tagline, group = tagline, y = mean_att_overall) +
stat_summary(fun.y = mean, geom = "point") +
stat_summary(fun.y = mean, geom = "line") +
ylim(3, 4.5) +
labs(title = "Interaction plot", x = "Animation", y = "Overall attitude")
## Warning: Removed 53 rows containing non-finite values (stat_summary).
## Warning: Removed 53 rows containing non-finite values (stat_summary).
Creating variables to use in APA reporting:
n.subjects.male <- length(subset(ads$gender, ads$gender == "male"))
n.subjects.female <- length(subset(ads$gender, ads$gender == "female"))
ads_anova_sm <- summary(ads_anova)
ads.anova.effect <- etaSquared(ads_anova)
ads.anova.f <- ads_anova_sm[[1]][["Pr(>F)"]]
Reporting of findings
The data set contains 97 male subjects and 51 female subjects. An ANOVA test was conducted to evaluate the hypothesis that the effect of animation of an ad will be stronger if the animation contains a tagline. This effect was indicated by a ‘mean overall attitude score’, or: attitude towards the ad. The results of this ANOVA test suggested that there is in fact a difference between an animation with and without a tagline. An animation with a tagline increases the overall positive attitude towards an ad (N2 = 0.0321807, 0.1137928, 0.0473082, 0.0383637, 0.1236281, 0.0553986, f = 0.0177057, 0.0000135, 0.0042388, NA).