Packages

library(tidyverse)
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 2.2.1     v purrr   0.2.4
v tibble  1.4.2     v dplyr   0.7.4
v tidyr   0.8.0     v stringr 1.3.0
v readr   1.1.1     v forcats 0.3.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()

Dataset and data cleaning

df <- read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vStv7Pr69DtRKv6Nw6gVBep8hbT3pEeO6B1vNwxK_1DUHgpoTgbuRpZ4SvgtHFQnBZJVGeeQVyRuXZl/pub?gid=1261585400&single=true&output=csv")
Missing column names filled in: 'X6' [6], 'X7' [7], 'X9' [9]Parsed with column specification:
cols(
  N = col_integer(),
  `Merijumu punkts` = col_character(),
  Comparison = col_character(),
  `difference i/o` = col_integer(),
  `difference e/0` = col_integer(),
  X6 = col_character(),
  X7 = col_character(),
  Clave = col_character(),
  X9 = col_character()
)
df <- janitor::clean_names(df)
df  <- df %>% 
  select(merijumu_punkts:difference_e_0)
df <- df %>% 
  rename(intraoral = difference_i_o, 
         extraoral = difference_e_0)

EDA

Summary

summary(df)
 merijumu_punkts     comparison          intraoral       extraoral    
 Length:540         Length:540         Min.   : 0.00   Min.   : 0.00  
 Class :character   Class :character   1st Qu.:30.00   1st Qu.: 0.00  
 Mode  :character   Mode  :character   Median :30.00   Median : 0.00  
                                       Mean   :26.33   Mean   :13.06  
                                       3rd Qu.:30.00   3rd Qu.:30.00  
                                       Max.   :90.00   Max.   :30.00  

Wide to long dataset (for easier calculations)

df <- df %>% 
  gather(key = "measurement", value = "value", intraoral:extraoral)

N of measurements

df %>% 
  group_by(comparison, measurement) %>% 
  summarise(N = n()) %>% 
  spread(measurement, N)

Mean and sd by type of measurement

df %>% 
  group_by(comparison, measurement) %>% 
  summarise(Mean = mean(value)) %>% 
  spread(measurement, Mean)
df %>% 
  group_by(comparison, measurement) %>% 
  summarise(sd = sd(value)) %>% 
  spread(measurement, sd)

Distributions

df %>% 
  ggplot(aes(x = value)) + 
  geom_histogram(bins = 4) + 
  facet_grid(measurement ~ .) +
  theme_minimal() + 
  labs(
    title = "Distribution of measurement", 
    y = "Count", 
    x = "Difference"
  )

Boxplot

df %>% 
  ggplot(aes(x = measurement, y = value)) + 
  geom_boxplot() + 
  theme_minimal()

df %>% 
  group_by(comparison, measurement) %>% 
  summarise(Mean = mean(value)) %>% 
  ggplot(aes(x = fct_reorder(comparison, Mean), y = Mean)) + 
  geom_col() + 
  facet_grid(. ~ measurement) + 
  theme_minimal()

Se observan diferencias entre las mediciones intraorales y las extraorales. En general, las diferencias son menores para las extraorales.

¿Hay diferencias entre grupos?

tres factores - intra vs extraoral: 2 niveles - lugar de la medición: 19 niveles - comparación (1 vs 2, etc)

comparacion <-  aov(df$value ~ df$merijumu_punkts + df$comparison + df$measurement)
summary(comparacion)
                     Df Sum Sq Mean Sq F value Pr(>F)    
df$merijumu_punkts   59  53949     914   3.935 <2e-16 ***
df$comparison         8   3297     412   1.773 0.0785 .  
df$measurement        1  47601   47601 204.826 <2e-16 ***
Residuals          1011 234952     232                   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Interpretación: hay diferencias significativas entre puntos y entre intra/extra

df %>% 
  group_by(merijumu_punkts, measurement) %>% 
  summarise(mean = mean(value)) %>% 
  # spread(measurement, mean) %>% 
  ggplot(aes(x = fct_reorder(merijumu_punkts, mean), y = mean)) + 
  geom_boxplot() + 
  theme_minimal() +
  coord_flip()

df %>% 
  ggplot(aes(x = fct_reorder(merijumu_punkts, value), y = value)) + 
  geom_col() + 
  #coord_flip() + 
  facet_wrap(~measurement)

NA

library(tidyverse)
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 2.2.1     v purrr   0.2.4
v tibble  1.4.2     v dplyr   0.7.4
v tidyr   0.8.0     v stringr 1.3.0
v readr   1.1.1     v forcats 0.3.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()

Aira 2

df2 <- read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vStv7Pr69DtRKv6Nw6gVBep8hbT3pEeO6B1vNwxK_1DUHgpoTgbuRpZ4SvgtHFQnBZJVGeeQVyRuXZl/pub?gid=20675042&single=true&output=csv")
Parsed with column specification:
cols(
  `Merijumu punkts` = col_character(),
  `1_1` = col_integer(),
  `1_2` = col_integer(),
  `1_3` = col_integer(),
  `1_4` = col_integer(),
  `1_5` = col_integer(),
  `1_6` = col_integer(),
  `1_7` = col_integer(),
  `1_8` = col_integer(),
  `1_9` = col_integer(),
  `1_10` = col_integer(),
  Measurement = col_character()
)
df2 <- janitor::clean_names(df2)
head(df2)
df2 <- df2 %>% 
  gather(key = "comparison", value = "value", x1_1:x1_10)
df2 %>% 
  arrange(desc(value))
package 'bindrcpp' was built under R version 3.4.4

voy a reemplazar el extraoral = 300 por 30

which(df2$value == 300)
[1] 1620

lo cambio

df2$value[1620] = 30

ordeno los factores

df2$measurement <- factor(df2$measurement, levels = c("intraoral", "extraoral", "intra_vs_extra"))
table(df2$measurement)

     intraoral      extraoral intra_vs_extra 
           600            600            600 

cambio por valores absolutos

df2$value <- abs(df2$value)

MEAN per group

SD

Boxplot per group

intra and extraoral

intra_vs_extra

Inferential

table(df2_intra_vs_extra$measurement)

intra_vs_extra 
           600 
summary(model_intra_vs_extra)
                 Df  Sum Sq Mean Sq F value   Pr(>F)    
merijumu_punkts  59 1474927   24999  55.969  < 2e-16 ***
comparison        9   31107    3456   7.738 9.78e-11 ***
Residuals       531  237173     447                     
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
mean(df2_intra_vs_extra$value)
[1] 89.43333

Convert the merijumu_punkts column

df2_intra_vs_extra$merijumu_punkts <-  str_replace_all(df2_intra_vs_extra$merijumu_punkts, " ", "")
Warning messages:
1: Unknown or uninitialised column: 'str_replace_all'. 
2: Unknown or uninitialised column: 'str_replace_all'. 
df2_intra_vs_extra <- df2_intra_vs_extra %>% 
  separate(merijumu_punkts, c("merijumu_punkts", "delete"), sep = "L") %>% 
  separate(merijumu_punkts, c("merijumu_punkts", "delete"), sep = "V")
Expected 2 pieces. Missing pieces filled with `NA` in 600 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].Expected 2 pieces. Missing pieces filled with `NA` in 600 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

---
title: "R Notebook"
output: 
  html_notebook: 
    toc: yes
    toc_float: true
    fig_caption: true
---

# Packages
```{r}
library(tidyverse)
```

# Dataset and data cleaning
```{r}
# Load the worksheet (gid=1261585400) published as CSV from Google Sheets;
# this call needs network access.
df <- read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vStv7Pr69DtRKv6Nw6gVBep8hbT3pEeO6B1vNwxK_1DUHgpoTgbuRpZ4SvgtHFQnBZJVGeeQVyRuXZl/pub?gid=1261585400&single=true&output=csv")
```
```{r}

# Normalise the column names, keep the columns from the measurement point
# through the extraoral difference, and give the two difference columns
# descriptive names.
df <- df %>%
  janitor::clean_names() %>%
  select(merijumu_punkts:difference_e_0) %>%
  rename(
    intraoral = difference_i_o,
    extraoral = difference_e_0
  )
```

# EDA

## Summary
```{r}
# Per-column overview of the cleaned data.
summary(df)
```
## Wide to long dataset (for easier calculations)
```{r}
# Reshape from wide to long: the intraoral and extraoral columns are stacked
# into a `measurement` (column name) / `value` (cell content) pair.
df <- gather(
  df,
  key = "measurement",
  value = "value",
  intraoral:extraoral
)
```

## N of measurements

```{r}
# Number of rows for every comparison, one column per measurement type.
df %>%
  group_by(comparison, measurement) %>%
  tally() %>%
  spread(key = measurement, value = n)
```
## Mean and sd by type of measurement
```{r}
# Mean value for every comparison, one column per measurement type.
df %>%
  group_by(comparison, measurement) %>%
  summarise(avg = mean(value)) %>%
  spread(key = measurement, value = avg)
```
```{r}
# Standard deviation for every comparison, one column per measurement type.
df %>%
  group_by(comparison, measurement) %>%
  summarise(std_dev = sd(value)) %>%
  spread(key = measurement, value = std_dev)

```

## Distributions

```{r}
# Histogram of the values, one panel row per measurement type.
ggplot(data = df, mapping = aes(x = value)) +
  geom_histogram(bins = 4) +
  facet_grid(measurement ~ .) +
  ggtitle("Distribution of measurement") +
  xlab("Difference") +
  ylab("Count") +
  theme_minimal()
```

## Boxplot
```{r}
# Side-by-side boxplots of the values for each measurement type.
ggplot(data = df, mapping = aes(x = measurement, y = value)) +
  geom_boxplot() +
  theme_minimal()
```





```{r}
# Bar chart of the mean value per comparison, with the comparisons ordered via
# fct_reorder() on those means and one panel per measurement type.
# The aes() expressions are kept as-is because the default axis titles are
# derived from them.
df %>%
  group_by(comparison, measurement) %>%
  summarise(Mean = mean(value)) %>%
  ggplot(mapping = aes(x = fct_reorder(comparison, Mean), y = Mean)) +
  geom_col() +
  theme_minimal() +
  facet_grid(. ~ measurement)
```
Se observan diferencias entre las mediciones intraorales y las extraorales. En general, las diferencias son menores para las extraorales.

# ¿Hay diferencias entre grupos?
tres factores
 - intra vs extraoral: 2 niveles
 - lugar de la medición: 19 niveles
 - comparación (1 vs 2, etc)

```{r}
# Additive ANOVA of the value on measurement point, comparison and measurement
# type. The columns are resolved through `data = df` instead of `df$...`
# vectors: the fitted model is the same, but the terms in summary() are
# labelled with the plain column names, and the call matches the form used for
# model_intra_vs_extra further down in this notebook.
comparacion <- aov(
  value ~ merijumu_punkts + comparison + measurement,
  data = df
)
```

```{r}
# Print the ANOVA table for the fitted model.
summary(comparacion)
```

Interpretación: hay diferencias significativas entre puntos y entre intra/extra

```{r}
# Mean value per measurement point and measurement type, drawn as horizontal
# boxplots with the points ordered via fct_reorder() on those means.
# (A spread(measurement, mean) step was tried here and is left disabled.)
df %>%
  group_by(merijumu_punkts, measurement) %>%
  summarise(mean = mean(value)) %>%
  ggplot(
    mapping = aes(x = fct_reorder(merijumu_punkts, mean), y = mean)
  ) +
  geom_boxplot() +
  coord_flip() +
  theme_minimal()
```

```{r}
# Stacked columns of the raw values per measurement point, one panel per
# measurement type (a coord_flip() was tried here and is left disabled).
ggplot(
  data = df,
  mapping = aes(x = fct_reorder(merijumu_punkts, value), y = value)
) +
  geom_col() +
  facet_wrap(~measurement)
  


```



-----------------------------------------
```{r}
library(tidyverse)
```

# Aira 2
```{r}
# Load the second worksheet (gid=20675042) published as CSV from Google Sheets;
# this call needs network access.
df2 <- read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vStv7Pr69DtRKv6Nw6gVBep8hbT3pEeO6B1vNwxK_1DUHgpoTgbuRpZ4SvgtHFQnBZJVGeeQVyRuXZl/pub?gid=20675042&single=true&output=csv")
```

```{r}
# Standardise the column names, then preview the first rows.
df2 <- janitor::clean_names(df2)
head(df2)
```


```{r}
# Reshape to long format: the columns x1_1 through x1_10 are stacked into a
# `comparison` (column name) / `value` (cell content) pair.
df2 <- gather(
  df2,
  key = "comparison",
  value = "value",
  x1_1:x1_10
)
```


```{r}
# Show the rows sorted by value, largest first.
arrange(df2, desc(value))
```
voy a reemplazar el extraoral = 300 por 30


```{r}
# Row position(s) whose value equals 300.
which(df2$value == 300)
```
lo cambio

```{r}
# Replace the value 300 with 30. The affected row(s) are located by value
# rather than by a hard-coded row number, so the correction still targets the
# right cell if the published sheet gains, loses or reorders rows, and it is a
# no-op once the sheet itself has been corrected. which() ignores NA values.
df2$value[which(df2$value == 300)] <- 30
```


ordeno los factores
```{r}
# Turn `measurement` into a factor with an explicit level order.
df2 <- df2 %>%
  mutate(
    measurement = factor(
      measurement,
      levels = c("intraoral", "extraoral", "intra_vs_extra")
    )
  )
```

```{r}
# Row count per measurement level.
table(df2$measurement)
```

cambio por valores absolutos
```{r}
# Work with absolute values from here on.
df2 <- df2 %>%
  mutate(value = abs(value))
```

## MEAN per group

```{r}
# Mean value for every comparison, one column per measurement level.
df2 %>%
  group_by(measurement, comparison) %>%
  summarise(avg = mean(value)) %>%
  spread(key = measurement, value = avg)
```
## SD
```{r}
# Standard deviation for every comparison, one column per measurement level.
df2 %>%
  group_by(measurement, comparison) %>%
  summarise(std_dev = sd(value)) %>%
  spread(key = measurement, value = std_dev)
```



## Boxplot per group

### intra and extraoral

```{r}
# Boxplots, with faint jittered raw points on top, for every measurement level
# except "intra_vs_extra".
ggplot(
  data = filter(df2, measurement != "intra_vs_extra"),
  mapping = aes(x = measurement, y = value)
) +
  geom_boxplot() +
  geom_jitter(alpha = 0.05) +
  theme_minimal()
```

### intra_vs_extra

```{r}
# Boxplot, with faint jittered raw points on top, for the "intra_vs_extra"
# rows only.
ggplot(
  data = filter(df2, measurement == "intra_vs_extra"),
  mapping = aes(x = measurement, y = value)
) +
  geom_boxplot() +
  geom_jitter(alpha = 0.05) +
  theme_minimal()
```

```{r}
# Histogram (5 bins) of the values in the "intra_vs_extra" rows.
ggplot(
  data = filter(df2, measurement == "intra_vs_extra"),
  mapping = aes(x = value)
) +
  geom_histogram(bins = 5) +
  theme_minimal()
```



```{r}
# Mean and standard deviation of the "intra_vs_extra" values per measurement
# point, listed from the largest mean to the smallest.
df2 %>%
  filter(measurement == "intra_vs_extra") %>%
  group_by(merijumu_punkts) %>%
  summarise(
    mean = mean(value),
    sd = sd(value)
  ) %>%
  arrange(desc(mean))
```

# Inferential

```{r}
# Preview the first rows of the long-format data.
head(df2)
```
```{r}
# Keep only the "intra_vs_extra" rows and drop the factor levels that no
# longer occur in `measurement`.
df2_intra_vs_extra <- df2 %>%
  filter(measurement == "intra_vs_extra") %>%
  mutate(measurement = droplevels(measurement))
```

```{r}
# Additive two-way ANOVA of the value on measurement point and comparison,
# fitted on the "intra_vs_extra" subset, followed by its ANOVA table.
model_intra_vs_extra <- aov(value ~ merijumu_punkts + comparison, data = df2_intra_vs_extra)
summary(model_intra_vs_extra)
```

```{r}
# Overall mean and standard deviation of the subset; the plots below use them
# for their horizontal reference lines.
df2_mean <- mean(df2_intra_vs_extra$value)
df2_sd <- sd(df2_intra_vs_extra$value)
```

```{r}

# Horizontal boxplots of the value per measurement point, ordered via
# fct_reorder(), with dashed reference lines at the overall mean (dark red)
# and at mean - SD / mean + SD (dark grey).
# The placeholder labels ("Edit For Title", ...) are replaced with real ones so
# the rendered report does not show template text, and the two symmetric grey
# lines are drawn by a single geom_hline() layer.
df2_intra_vs_extra %>%
  ggplot(aes(x = fct_reorder(merijumu_punkts, value), y = value)) +
  geom_boxplot() +
  geom_hline(yintercept = df2_mean, lty = 2, color = "darkred") +
  geom_hline(
    yintercept = df2_mean + c(-1, 1) * df2_sd,
    lty = 2,
    color = "darkgrey"
  ) +
  theme_minimal() +
  coord_flip() +
  labs(
    title = "Difference Between Measurement Points",
    subtitle = "Red line = mean, grey lines = mean +/- 1 standard deviation",
    y = "Difference",
    x = "Measurement point"
  )
```
Convert the merijumu_punkts column
```{r}
# Remove every space from the measurement-point labels.
df2_intra_vs_extra <- mutate(
  df2_intra_vs_extra,
  merijumu_punkts = str_replace_all(merijumu_punkts, " ", "")
)
```


```{r}
# Preview the subset after the label clean-up.
head(df2_intra_vs_extra)
```
```{r}
# Keep only the part of each label that precedes the first "L" or "V".
# This replaces two chained separate() calls plus a select(-delete): those
# produced the same labels, but created (and then overwrote) a throwaway
# `delete` column and raised an "Expected 2 pieces" warning for rows that do
# not contain the delimiter. Labels without "L" or "V" are left unchanged and
# NA stays NA.
df2_intra_vs_extra <- df2_intra_vs_extra %>%
  mutate(merijumu_punkts = str_replace(merijumu_punkts, "[LV].*$", ""))
```

```{r}
# Horizontal boxplots of the value per (truncated) label, ordered via
# fct_reorder(), with dashed reference lines at the overall mean (dark red)
# and at mean - SD / mean + SD (dark grey).
ggplot(
  data = df2_intra_vs_extra,
  mapping = aes(x = fct_reorder(merijumu_punkts, value), y = value)
) +
  geom_boxplot() +
  geom_hline(yintercept = df2_mean, lty = 2, color = "darkred") +
  geom_hline(
    yintercept = df2_mean + c(-1, 1) * df2_sd,
    lty = 2,
    color = "darkgrey"
  ) +
  coord_flip() +
  ggtitle(
    "Difference Between Teeth",
    subtitle = "Red line = mean, grey lines = standard deviation"
  ) +
  xlab("Teeth") +
  ylab("Difference") +
  theme_minimal()
```


