R Notebook

# Load the raw data; fields in the file are separated by semicolons.
dat <- read.csv("color-score.csv", sep=';')

# Show the rows where score is already NA before any type conversion.
dat[is.na(dat$score),]

# Try converting score to numeric; entries that are not numbers become NA
# and R emits a coercion warning.
as.numeric(dat$score)

## Warning: NAs introduced by coercion

##   [1]  0 42 22 NA  0  6 68 28 38 38 88  0 37  0 57  5 40 24 24 NA 31 55 38 21 44
##  [26] 52 37 49 11  7 21 27 15 26 43 21 59  0 NA 52 36 46 24 21 17  0 60 25 52 20
##  [51]  3 38 63 40 45 13 34 25 NA 12 34 55 23 28 64 NA 71 57 20 25 NA 56 33 74 79
##  [76] NA 53 26 31 31 43 12 21 23 60 15 35 NA 19 71 NA 33 64 78 64 35 45 48 30 23
## [101] 49 69 61 82 35 24 26 41  0 60  7 35 40 72 28 65 19 24 22 41 39 21 69 31 NA
## [126] 79 48 62 45 35 63 28 59 NA  0 33 35 44 28 29 43 30 47 28 30 62 29 39 22 40
## [151] 22 34 NA 66 44 51 11 12 59 15 33 57 31 30 42 19 69  9 67 61 55 35 22 59 57
## [176] 75 26 72 18 48 82 NA 21 11 38 68 17 82  6 49 NA 35 75 56 70 56 37 35 15 32
## [201] 38 43 30 24 56 11 41 67 18 39 28 39 59 15 65 74 33  1 68 38 51 28 60 33 57
## [226] 27 46 44 40 71 41 13 18 34 28 73 NA 60 20 NA  0 32 NA 52 45 54 37 22 68 32
## [251] NA 39 NA NA 26 21 21 NA NA NA 26  3 NA

Look at the rows whose score cannot be converted to a number.

Are there any non-numeric strings other than “abs” and “коммуникация”?

# Select the rows whose score is NA after as.numeric(), i.e. the entries
# that cannot be used as numbers.
dat[is.na(as.numeric(dat$score)),]

## Warning in `[.data.frame`(dat, is.na(as.numeric(dat$score)), ): NAs introduced
## by coercion

# List the distinct raw score values among the rows that fail numeric
# conversion.
unique(
  dat[is.na(as.numeric(dat$score)),]$score
)

## Warning in `[.data.frame`(dat, is.na(as.numeric(dat$score)), ): NAs introduced
## by coercion

## [1] "abs"          "коммуникация"

# Start again from the raw file and convert score to numeric in place.
# Non-numeric entries become NA, which triggers the coercion warning.
dat <- read.csv("color-score.csv", sep=';')
dat$score <- as.numeric(dat$score)

## Warning: NAs introduced by coercion

# Drop the rows whose score is NA, keeping only columns 2 and 3
# (the color and score columns used in the rest of the analysis).
dat <- dat[!is.na(dat$score), c(2, 3)]

# Count the observations that remain.
nrow(dat)

## [1] 239

Can we conclude that color affects score?

# Draw one box per color to compare the score distributions between groups.
boxplot(score ~ color, data=dat)

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Mean score within each color group.
dat %>%
  group_by(color) %>%
  summarise(score = mean(score))

# Extract the scores of the "b" (blue) and "o" (orange) groups as vectors.
blue_score <- dat %>%
  filter(color == "b") %>%
  pull(score)

orange_score <- dat %>%
  filter(color == "o") %>%
  pull(score)

# Compare the two group means; with default arguments t.test() runs the
# Welch two-sample test.
t.test(blue_score, orange_score)

## 
##  Welch Two Sample t-test
## 
## data:  blue_score and orange_score
## t = -2.4037, df = 95.973, p-value = 0.01815
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -17.056724  -1.627465
## sample estimates:
## mean of x mean of y 
##  31.63830  40.98039

# Scale the p-value reported by the t-test above by 10 comparisons.
# NOTE(review): 10 is presumably choose(5, 2), the number of color pairs,
# making this a Bonferroni correction — confirm.
p.adjust(0.01815, method = "bonferroni", n = 10)

## [1] 0.1815

ANOVA

Analysis of variance

# Fit a one-way ANOVA of score on color and print its summary table.
color_anova <- aov(score ~ color, data = dat)
summary(color_anova)

##              Df Sum Sq Mean Sq F value Pr(>F)
## color         4   2725   681.3   1.652  0.162
## Residuals   234  96528   412.5

# Sample standard deviation of score over all remaining observations.
stddev_data <- sd(dat[["score"]])

print(stddev_data)

## [1] 20.42137

# Standard error of the mean: standard deviation divided by the square root
# of the number of observations.
stderror_mean <- stddev_data / sqrt(nrow(dat))

print(stderror_mean)

## [1] 1.320949

# Half-width of an approximate 95% confidence interval for the mean.
# 1.96 is the 0.975 quantile of the standard normal distribution
# (qnorm(0.975)); a different confidence level needs a different quantile.
s <- stderror_mean * 1.96

# Point estimate: the sample mean of score.
mean(dat$score)

## [1] 38.03347

# Approximate 95% confidence interval: sample mean minus/plus the half-width.
mean(dat$score) + c(-1, 1) * s

## [1] 35.44441 40.62253

# One-sample t-test of score. Its 95% interval uses the t quantile for
# df = n - 1 rather than 1.96, so it is slightly wider than the
# normal-approximation interval computed by hand above.
t.test(dat$score)

## 
##  One Sample t-test
## 
## data:  dat$score
## t = 28.793, df = 238, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  35.43123 40.63572
## sample estimates:
## mean of x 
##  38.03347