# Load the raw data; the file uses ';' as its field separator.
dat <- read.csv(file = "color-score.csv", sep = ";")
# Show the rows whose score was already read in as NA.
dat[is.na(dat[["score"]]), ]
# Try to convert the score column to numbers; entries that cannot be parsed
# become NA and trigger a coercion warning.
as.numeric(dat[["score"]])
## Warning: NAs introduced by coercion
## [1] 0 42 22 NA 0 6 68 28 38 38 88 0 37 0 57 5 40 24 24 NA 31 55 38 21 44
## [26] 52 37 49 11 7 21 27 15 26 43 21 59 0 NA 52 36 46 24 21 17 0 60 25 52 20
## [51] 3 38 63 40 45 13 34 25 NA 12 34 55 23 28 64 NA 71 57 20 25 NA 56 33 74 79
## [76] NA 53 26 31 31 43 12 21 23 60 15 35 NA 19 71 NA 33 64 78 64 35 45 48 30 23
## [101] 49 69 61 82 35 24 26 41 0 60 7 35 40 72 28 65 19 24 22 41 39 21 69 31 NA
## [126] 79 48 62 45 35 63 28 59 NA 0 33 35 44 28 29 43 30 47 28 30 62 29 39 22 40
## [151] 22 34 NA 66 44 51 11 12 59 15 33 57 31 30 42 19 69 9 67 61 55 35 22 59 57
## [176] 75 26 72 18 48 82 NA 21 11 38 68 17 82 6 49 NA 35 75 56 70 56 37 35 15 32
## [201] 38 43 30 24 56 11 41 67 18 39 28 39 59 15 65 74 33 1 68 38 51 28 60 33 57
## [226] 27 46 44 40 71 41 13 18 34 28 73 NA 60 20 NA 0 32 NA 52 45 54 37 22 68 32
## [251] NA 39 NA NA 26 21 21 NA NA NA 26 3 NA
Look at the rows whose score cannot be converted to a number.
Are there any non-numeric strings other than “abs” and “коммуникация”?
# Show every row whose score is NA after numeric conversion. The conversion is
# done inline, so the coercion warning is reported from inside `[.data.frame`.
dat[is.na(as.numeric(dat$score)),]
## Warning in `[.data.frame`(dat, is.na(as.numeric(dat$score)), ): NAs introduced
## by coercion
# List the distinct raw score values found in those rows.
unique(
dat[is.na(as.numeric(dat$score)),]$score
)
## Warning in `[.data.frame`(dat, is.na(as.numeric(dat$score)), ): NAs introduced
## by coercion
## [1] "abs" "коммуникация"
# Start again from the raw file so the cleaning steps run on untouched data.
dat <- read.csv(file = "color-score.csv", sep = ";")
# Non-numeric score entries become NA here, hence the coercion warning.
dat[["score"]] <- as.numeric(dat[["score"]])
## Warning: NAs introduced by coercion
# Keep only rows with a numeric score, and only columns 2 and 3
# (presumably color and score, the only columns used afterwards -- confirm
# against the CSV header).
dat <- dat[!is.na(dat[["score"]]), c(2, 3)]
# Number of rows that survive the cleaning.
nrow(dat)
## [1] 239
Can we conclude that color affects score?
# Visual comparison: one box of scores per color.
boxplot(score ~ color, data = dat)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Mean score for each color.
dat %>%
  group_by(color) %>%
  summarise(score = mean(score))
# Pull out the scores of the "b" and "o" color groups as plain vectors.
blue_score <- dat %>%
  filter(color == "b") %>%
  pull(score)
orange_score <- dat %>%
  filter(color == "o") %>%
  pull(score)
# Two-sample t-test comparing the two groups (t.test() defaults to the Welch
# variant, which does not assume equal variances).
t.test(blue_score, orange_score)
##
## Welch Two Sample t-test
##
## data: blue_score and orange_score
## t = -2.4037, df = 95.973, p-value = 0.01815
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -17.056724 -1.627465
## sample estimates:
## mean of x mean of y
## 31.63830 40.98039
# Multiply the t-test p-value (copied from the output above) by 10 --
# presumably a Bonferroni correction for the choose(5, 2) = 10 possible
# pairwise comparisons between the five colors; confirm the intended count.
p.adjust(0.01815, method = "bonferroni", n = 10)
## [1] 0.1815
Analysis of variance (ANOVA)
# One-way ANOVA of score on color: fit the model, then print the ANOVA table.
color_anova <- aov(score ~ color, data = dat)
summary(color_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## color 4 2725 681.3 1.652 0.162
## Residuals 234 96528 412.5
# Sample standard deviation of all valid scores.
stddev_data <- sd(dat$score)
stddev_data
## [1] 20.42137
# Standard error of the mean: standard deviation divided by sqrt(n).
stderror_mean <- stddev_data / sqrt(nrow(dat))
stderror_mean
## [1] 1.320949
# Half-width of a normal-approximation 95% confidence interval.
# 1.96 is (approximately) the 0.975 quantile of the standard normal
# distribution; a different confidence level needs a different multiplier.
s <- 1.96 * stderror_mean
mean(dat$score)
## [1] 38.03347
# Lower and upper bounds: mean minus / plus the half-width.
mean(dat$score) + c(-1, 1) * s
## [1] 35.44441 40.62253
# One-sample t-test on all scores. Its output reports a 95% confidence interval
# for the mean, which can be compared with the interval computed by hand above
# using the 1.96 multiplier.
t.test(dat$score)
##
## One Sample t-test
##
## data: dat$score
## t = 28.793, df = 238, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 35.43123 40.63572
## sample estimates:
## mean of x
## 38.03347