An online version of the Notebook is available at http://rpubs.com/jvervaart/assignment5.

library(rmarkdown)
library(knitr)
library(lsr)
library(stats)
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

options(scipen=999) # restricting the use of the scientific notation

Task 1

Loading and inspecting the data (PU6-exercise-data.csv).

memory <- read.csv('C:/Users/jesse/Documents/PM DSS/Statistics/PU6-exercise-data.csv', header = TRUE)

str(memory)
## 'data.frame':    100 obs. of  3 variables:
##  $ id          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ age         : int  21 61 20 24 54 53 56 57 38 43 ...
##  $ memory_score: num  71.1 67.7 65.3 81.2 80.7 ...
summary(memory)
##        id              age         memory_score  
##  Min.   :  1.00   Min.   :18.00   Min.   :52.36  
##  1st Qu.: 25.75   1st Qu.:27.75   1st Qu.:66.47  
##  Median : 50.50   Median :38.50   Median :73.26  
##  Mean   : 50.50   Mean   :39.80   Mean   :73.27  
##  3rd Qu.: 75.25   3rd Qu.:53.00   3rd Qu.:81.21  
##  Max.   :100.00   Max.   :65.00   Max.   :90.48
head(memory, n = 10)

Task 2

First dividing the age groups:

age_spread <- quantile(memory$age, prob = c(.33, .66))

memory$age_grouped <- ifelse(memory$age <= age_spread['33%'], 'Young', ifelse(memory$age <= age_spread['66%'], 'Middle.aged', 'Old'))

memory$age_grouped <- factor(memory$age_grouped)

str(memory) # Was turned into three levels
## 'data.frame':    100 obs. of  4 variables:
##  $ id          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ age         : int  21 61 20 24 54 53 56 57 38 43 ...
##  $ memory_score: num  71.1 67.7 65.3 81.2 80.7 ...
##  $ age_grouped : Factor w/ 3 levels "Middle.aged",..: 3 2 3 3 2 2 2 2 1 1 ...

Then, creating the ANOVA model:

memory_anova <- aov(data = memory, formula = memory_score ~ age_grouped)

Fetching residuals to create QQ-Plot:

memory_anova_residuals <- residuals(memory_anova)

Creating QQ-Plot:

qqnorm(memory_anova_residuals)

qqline(memory_anova_residuals)

After creating ANOVA model, we can look at the ANOVA test to look at the effect size:

summary(memory_anova)
##             Df Sum Sq Mean Sq F value Pr(>F)
## age_grouped  2    328  163.76   1.697  0.189
## Residuals   97   9363   96.52
etaSquared(memory_anova)
##                 eta.sq eta.sq.part
## age_grouped 0.03379913  0.03379913

Visualising the ANOVA test results:

memory_age_split <- split(memory$memory_score, memory$age_grouped)

memory_means <- c(mean(memory_age_split$Young), mean(memory_age_split$Middle.aged), mean(memory_age_split$Old))
memory_labels <- c('Young', 'Middle-aged', 'Old')

barplot(memory_means, 
        ylim = c(0, 100), 
        main = 'Memory scores', 
        ylab = 'Memory score', 
        xlab = 'Age', 
        names.arg = memory_labels, 
        col = rainbow(6)) # I experimented with rainbow() values until I found something that was pleasing to the eye

Creating variables to use in APA reporting:

n.young <- length(memory_age_split$Young)
n.middle <- length(memory_age_split$Middle.aged)
n.old <- length(memory_age_split$Old)

memory_anova_sm <- summary(memory_anova)
memory.anova.effect <- etaSquared(memory_anova)
memory.anova.f <- memory_anova_sm[[1]][["Pr(>F)"]]

Reporting of findings

The data set contains 36 younger subjects (M = 70.8662209), 30 middle-aged subjects (M = 74.8336047) and 34 older subjects (M = 74.8336047). An ANOVA test was conducted to evaluate the hypothesis that memory scores between age groups are equal. The results of this ANOVA test suggested that memory scores between age groups differ significantly (N2 = 0.0337991, f = 0.0337991).

Task 3

Loading and inspecting the data (Statistic_2017_PracticeUnit6_ANOVA_Survey Exercises.csv).

survey <- read.csv('C:/Users/jesse/Documents/PM DSS/Statistics/Statistic_2017_PracticeUnit6_ANOVA_Survey Exercises.csv', header = TRUE)

head(survey, n = 10)

Task 4

Including:

Only include prerequisites:

source_family <- subset(survey, survey$source == "FAMILY")
source_family$source <- revalue(source_family$source, c('FAMILY' = 'Family')) # Changing way value is represented in data set, to remove '/' in values

source_health <- subset(survey, survey$source == "HEALTH/ILLNESS")
source_health$source <- revalue(source_health$source, c('HEALTH/ILLNESS' = 'Health')) # Changing way value is represented in data set, to remove '/' in values

source_money <- subset(survey, survey$source == "MONEY/FINANCES")
source_money$source <- revalue(source_money$source, c('MONEY/FINANCES' = 'Money')) # Changing way value is represented in data set, to remove '/' in values

rmarkdown::paged_table(droplevels(source_family))
rmarkdown::paged_table(droplevels(source_health))
rmarkdown::paged_table(droplevels(source_money))
survey_includes <- rbind(source_family, source_health, source_money)

survey_includes$source <- factor(survey_includes$source)

rmarkdown::paged_table(survey_includes)

Calculating means:

# Reverse-worded (as per the assignment description) so 6 - 4x pss* variable
survey_includes$pss4_r <- 6 - survey_includes$pss4
survey_includes$pss5_r <- 6 - survey_includes$pss5
survey_includes$pss7_r <- 6 - survey_includes$pss7 
survey_includes$pss8_r <- 6 - survey_includes$pss8

survey_includes$mean_stress <- ((survey_includes$pss1 + survey_includes$pss2 + survey_includes$pss3 + survey_includes$pss4_r + survey_includes$pss5_r + survey_includes$pss6 + survey_includes$pss7_r + survey_includes$pss8_r + survey_includes$pss9 + survey_includes$pss10) / 10)

survey_no.na <- survey_includes[!is.na(survey_includes$mean_stress),]

Visualising data:

hist(x = survey_no.na$mean_stress,
     breaks = 10,
     xlab = 'Stress',
     main = 'Perceived stress')

Conducting the ANOVA test:

survey_anova <- aov(data = survey_no.na, formula = mean_stress ~ source)

survey_anova_residuals <- residuals(survey_anova)

qqnorm(y = survey_anova_residuals)
qqline(y = survey_anova_residuals)

summary(survey_anova)
##             Df Sum Sq Mean Sq F value Pr(>F)  
## source       2   2.13  1.0656   3.105 0.0493 *
## Residuals   96  32.94  0.3432                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
etaSquared(survey_anova)
##           eta.sq eta.sq.part
## source 0.0607634   0.0607634

Visualising results of ANOVA test:

survey_split <- (split(survey_no.na$mean_stress, survey_no.na$source))


survey_means <- c(mean(survey_split$Family), mean(survey_split$Health), mean(survey_split$Money))

survey_labels <- c('Family', 'Health/Illness', 'Money/Finances')

barplot(survey_means, 
        ylim = c(0, 4), 
        main = 'Sources of stress', 
        ylab = 'Level of stress', 
        xlab = 'Source', 
        names.arg = survey_labels, 
        col = rainbow(4))

Creating variables to use in APA reporting:

n.family <- length(survey_split$Family)
n.health <- length(survey_split$Health)
n.money <- length(survey_split$Money)

survey_anova_sm <- summary(survey_anova)
survey.anova.effect <- etaSquared(survey_anova)
survey.anova.f <- survey_anova_sm[[1]][["Pr(>F)"]]

Reporting of findings

The data set contains 26 (M = 2.5538462) subjects who identified family as a source of stress, 20 subjects who identified health and/or illness as a source of stress (M = 2.95), and 53 subjects who identified money or finances as a source of stress (M = 2.6056604). An ANOVA test was conducted to evaluate the hypothesis that stress levels do not differ between sources of stress. The results of this ANOVA test suggested that stress levels differ significantly (N2 = 0.0607634, 0.0607634, f = 0.0493401, NA).

Task 5

Loading the dataset (PU7_exercise2_ads.csv) and exploring it:

ads <- read.csv('C:/Users/jesse/Documents/PM DSS/Statistics/PU7_exercise2_ads.csv', header = TRUE)

summary(ads)
##     gender        age        animation tagline  mean_att_overall
##  female:51   Min.   :16.00   no :77    no :73   Min.   :1.467   
##  male  :97   1st Qu.:21.00   yes:71    yes:75   1st Qu.:2.800   
##              Median :25.00                      Median :3.200   
##              Mean   :29.82                      Mean   :3.199   
##              3rd Qu.:33.25                      3rd Qu.:3.600   
##              Max.   :62.00                      Max.   :5.000
str(ads)
## 'data.frame':    148 obs. of  5 variables:
##  $ gender          : Factor w/ 2 levels "female","male": 2 2 2 2 2 2 2 2 2 2 ...
##  $ age             : int  20 21 24 45 27 20 20 59 39 19 ...
##  $ animation       : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ tagline         : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ mean_att_overall: num  2 2.93 3.4 3.93 2.47 ...
head(ads, n = 10)

Task 6

Testing the following hypotheses:

Main effects

h1: The effect of animation on attitude will be stronger if the ad containts a tagline (interaction).

Creating ANOVA model:

ads_anova <- aov(data = ads, formula = mean_att_overall ~ animation * tagline) # tagline is the interaction in the ANOVA model

ads_anova_residuals <- residuals(ads_anova)

qqnorm(ads_anova_residuals)
qqline(ads_anova_residuals)

Conducting ANOVA test:

summary(ads_anova)
##                    Df Sum Sq Mean Sq F value    Pr(>F)    
## animation           1   1.98   1.985   5.757   0.01771 *  
## tagline             1   7.00   7.004  20.314 0.0000135 ***
## animation:tagline   1   2.91   2.912   8.445   0.00424 ** 
## Residuals         144  49.65   0.345                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
etaSquared(ads_anova)
##                       eta.sq eta.sq.part
## animation         0.03218073  0.03836371
## tagline           0.11379282  0.12362809
## animation:tagline 0.04730816  0.05539856

Because the tested interaction is yes or no, I used an interaction plot to visualise the data, instead of a barplot:

ads %>%
  ggplot() +
  aes(x = animation, color = tagline, group = tagline, y = mean_att_overall) +
  stat_summary(fun.y = mean, geom = "point") +
  stat_summary(fun.y = mean, geom = "line") +
  ylim(3, 4.5) +
  labs(title = "Interaction plot", x = "Animation", y = "Overall attitude")
## Warning: Removed 53 rows containing non-finite values (stat_summary).

## Warning: Removed 53 rows containing non-finite values (stat_summary).

Creating variables to use in APA reporting:

n.subjects.male <- length(subset(ads$gender, ads$gender == "male"))
n.subjects.female <- length(subset(ads$gender, ads$gender == "female"))

ads_anova_sm <- summary(ads_anova)
ads.anova.effect <- etaSquared(ads_anova)
ads.anova.f <- ads_anova_sm[[1]][["Pr(>F)"]]

Reporting of findings

The data set contains 97 male subjects and 51 female subjects. An ANOVA test was conducted to evaluate the hypothesis that the effect of animation of an ad will be stronger if the animation contains a tagline. This effect was indicated by a ‘mean overall attitude score’, or: attitude towards the ad. The results of this ANOVA test suggested that there is in fact a difference between an animation with and without a tagline. An animation with a tagline increases the overall positive attitude towards an ad (N2 = 0.0321807, 0.1137928, 0.0473082, 0.0383637, 0.1236281, 0.0553986, f = 0.0177057, 0.0000135, 0.0042388, NA).