# install.packages(c("tidyverse", "janitor", "broom"))
library(tidyverse)
[30m── [1mAttaching packages[22m ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──[39m
[30m[32m✔[30m [34mggplot2[30m 3.0.0 [32m✔[30m [34mpurrr [30m 0.2.5
[32m✔[30m [34mtibble [30m 1.4.2 [32m✔[30m [34mdplyr [30m 0.7.6
[32m✔[30m [34mtidyr [30m 0.8.1 [32m✔[30m [34mstringr[30m 1.3.1
[32m✔[30m [34mreadr [30m 1.1.1 [32m✔[30m [34mforcats[30m 0.3.0[39m
[30m── [1mConflicts[22m ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[30m [34mdplyr[30m::[32mfilter()[30m masks [34mstats[30m::filter()
[31m✖[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()[39m
library(janitor)
library(broom)
# install.packages("viridis")
library(viridis)
Loading required package: viridisLite
df <- read_csv("datasetOralHealthLatvia.csv")
Parsed with column specification:
cols(
.default = col_integer(),
Time = col_character(),
`2_Examination_date` = col_character(),
child1 = col_character(),
`4_Birth_date` = col_character(),
C17V = col_character(),
C17D = col_character(),
C17O = col_character(),
C17M = col_character(),
C17L = col_character(),
C16V = col_character(),
C16D = col_character(),
C16O = col_character(),
C16M = col_character(),
C16L = col_character(),
C15V = col_character(),
C15D = col_character(),
C15O = col_character(),
C15M = col_character(),
C15L = col_character(),
C14V = col_character()
# ... with 159 more columns
)
See spec(...) for full column specifications.
df[df == 99] <- NA
df <- df %>%
select(-c(C17V:R41V))
df <- df %>%
select(-c(Age,
`2_Examination_date`,
RegionsKods,
SkolasKods,
child1,
`3d_Examination_time_(forst_or_second)`,
SkolasKods.y,
ID))
df <- df %>%
rename( Gender = `1_gender`,
`Live in` = `2_Live_in`,
Pain = `3_Pain_or_other_dental_disorders_in_last_12_months`,
Region = RegionName,
`School name` = SkolaName)
df <- df %>%
mutate( "D1 status" = case_when(
D1T > 0 ~ 1,
TRUE ~ 0
))
df <- df %>%
mutate( "D3 status" = case_when(
D3T > 0 ~ 1,
TRUE ~ 0
))
df <- df %>%
mutate( "D5 status" = case_when(
D5T > 0 ~ 1,
TRUE ~ 0
))
df_caries <- df %>%
gather(key = T, value = "value_t", D1T:D5MFT) %>%
gather(key = S, value = "value_s", D1S:D5MFS)
df_caries$T <- fct_relevel(df_caries$T, "D1T", "D3T", "D5T", "MT","FT", "D1MFT", "D3MFT", "D5MFT")
df_caries$S <- fct_relevel(df_caries$S, "D1S", "D3S", "D5S", "MS","FS", "D1MFS", "D3MFS", "D5MFS")
Datasets ready for analysis
Childrens per school
df %>%
group_by(`School name`) %>%
summarise(n = n()) %>%
mutate(average = mean(n))
which(is.na(df$`Live in`))
[1] 72 752 992 1086 1266 1828
df$`Live in`[is.na(df$`Live in`)] <- "City"
df %>%
group_by(`Live in`, Gender) %>%
summarise(n = n()) %>%
spread(Gender, n)
df <- df %>%
mutate(area = case_when(
`Live in` == "Country" ~ "Rural",
TRUE ~ "Urban"
))
tabyl(df, area, Gender)
area F M
Rural 223 253
Urban 808 854
df$FAS_cat <- fct_relevel(df$FAS_cat, "Low affluence", "Middle affluence", "High affluence")
df$FAS_cat[is.na(df$FAS_cat)] <- "Middle affluence"
df %>%
group_by(Gender, FAS_cat) %>%
summarise(n = n()) %>%
spread(Gender, n)
df %>%
#filter(D1MFT > 0) %>%
tabyl(D1MFT) %>%
adorn_pct_formatting(digits = 2)
D1MFT n percent
0 33 1.54%
1 37 1.73%
2 69 3.23%
3 86 4.02%
4 164 7.67%
5 170 7.95%
6 223 10.43%
7 148 6.92%
8 191 8.93%
9 163 7.62%
10 127 5.94%
11 122 5.71%
12 117 5.47%
13 72 3.37%
14 72 3.37%
15 63 2.95%
16 59 2.76%
17 46 2.15%
18 27 1.26%
19 29 1.36%
20 29 1.36%
21 16 0.75%
22 25 1.17%
23 13 0.61%
24 16 0.75%
25 3 0.14%
26 8 0.37%
27 7 0.33%
28 3 0.14%
df %>%
#filter(D3MFT > 0) %>%
tabyl(D3MFT) %>%
adorn_pct_formatting(digits = 2)
D3MFT n percent
0 433 20.25%
1 241 11.27%
2 281 13.14%
3 265 12.39%
4 314 14.69%
5 183 8.56%
6 142 6.64%
7 80 3.74%
8 61 2.85%
9 54 2.53%
10 24 1.12%
11 21 0.98%
12 13 0.61%
13 9 0.42%
14 4 0.19%
15 6 0.28%
16 2 0.09%
17 1 0.05%
18 3 0.14%
20 1 0.05%
df %>%
#filter(D5MFT > 0) %>%
tabyl(D5MFT) %>%
adorn_pct_formatting(digits = 2)
D5MFT n percent
0 600 28.06%
1 319 14.92%
2 320 14.97%
3 284 13.28%
4 259 12.11%
5 114 5.33%
6 90 4.21%
7 55 2.57%
8 36 1.68%
9 20 0.94%
10 17 0.80%
11 10 0.47%
12 6 0.28%
13 6 0.28%
15 1 0.05%
16 1 0.05%
df %>%
filter(D1MFT > 0) %>%
tabyl(Gender) %>%
adorn_pct_formatting(digits = 1)
Gender n percent
F 1017 48.3%
M 1088 51.7%
df %>%
filter(D3MFT > 0) %>%
group_by(Gender) %>%
summarise(n = n())
df %>%
filter(D5MFT > 0) %>%
group_by(Gender) %>%
summarise(n = n())
df %>%
filter(D1MFT > 0) %>%
group_by(Region) %>%
summarise(n = n())
df %>%
filter(D3MFT > 0) %>%
group_by(Region) %>%
summarise(n = n())
df %>%
filter(D5MFT > 0) %>%
group_by(Region) %>%
summarise(n = n())
df %>%
filter(D1MFT > 0) %>%
group_by(area) %>%
summarise(n = n())
df %>%
filter(D3MFT > 0) %>%
group_by(area) %>%
summarise(n = n())
df %>%
filter(D5MFT > 0) %>%
group_by(area) %>%
summarise(n = n())
df %>%
filter(D1MFT > 0) %>%
group_by(FAS_cat) %>%
summarise(n = n())
df %>%
filter(D3MFT > 0) %>%
group_by(FAS_cat) %>%
summarise(n = n())
df %>%
filter(D5MFT > 0) %>%
group_by(FAS_cat) %>%
summarise(n = n())
df_caries %>%
group_by(T) %>%
summarise(Mean = mean(value_t),
sd = sd(value_t),
`25%`=quantile(value_t, probs=0.25),
`50%`=quantile(value_t, probs=0.5),
`75%`=quantile(value_t, probs=0.75))
df_caries %>%
group_by(S) %>%
summarise(Mean = mean(value_s),
sd = sd(value_s),
`25%`=quantile(value_s, probs=0.25),
`50%`=quantile(value_s, probs=0.5),
`75%`=quantile(value_s, probs=0.75))
df_caries %>%
filter(T %in% c("D1T", "D3T", "D5T", "MT", "FT")) %>%
group_by(T, Region) %>%
summarise(mean_t = mean(value_t)) %>%
ggplot(aes(x = Region , y = mean_t, fill = T)) +
geom_col() +
coord_flip() +
theme_classic() +
labs(title = "Caries severity by region in 12-year-old children, Latvia, 2016",
x = "Region",
y = "Mean value",
fill = "Legend") +
scale_fill_viridis_d(direction = -1)
df_caries %>%
group_by(T, Gender) %>%
summarise(mean = mean(value_t)) %>%
spread(T, mean)
Cuantos ni?os tienes caries en distintos niveles?
table(df$`D1 status`)
0 1
115 2023
table(df$`D3 status`)
0 1
1201 937
table(df$`D5 status`)
0 1
1668 470
df %>%
gather(key = "Caries_threshold", value = value, `D1 status`:`D5 status`) %>%
select(Gender, `School name`, Region, Caries_threshold, value) %>%
filter(value == 1) %>%
group_by(Caries_threshold, Gender) %>%
summarise(n = n()) %>%
spread(Gender, n)
df <- df %>% mutate(`Visits to dentist last year (1 = less than once per year)` =
case_when(
`4_Frequency_of_dentist_visits_in_last_12_months` %in% c("Two or more times", "One time") ~ 0,
TRUE ~ 1
)
)
df <- df %>% mutate(`Visits to hygienist last year (1 = less than once per year)` =
case_when(
`7_Frequency_of_dental_hygienist_visits` %in% c("Two or more times per year", "One time per year") ~ 0,
TRUE ~ 1
)
)
df <- df %>% mutate(`Frequency of tooth-brushing (1 = less than once per day)` =
case_when(
`8_Frequency_of_toothbrushing` %in% c( "Two or more times per day" , "Once per day ") ~ 0,
TRUE ~ 1
)
)
df <- df %>% mutate(`Family affluence according to the FAS scale (1 = low family affluence)` =
case_when(
FAS_cat == "Low affluence" ~ 1,
TRUE ~ 0
)
)
df <- df %>%
mutate(`Flossing (1 = dental floss not used)` =
case_when(
`9_Usage_of_dental_floss` == "Yes" ~ 0,
TRUE ~ 1
)
)
df <- df %>%
mutate(`Mouthwash (1 = mouthwash not used)` =
case_when(
`9_Usage_of_mouth_wash` == "Yes" ~ 0,
TRUE ~ 1
)
)
df <- df %>%
mutate( `Diet (1 = high in fermentable carbohydrates)` =
case_when(
`13_Eating_habits_grouped` == 1 ~ 0,
TRUE ~ 1
))
df <- df %>%
mutate( `Added sugar in hot drinks (1= extra sugar added to hot drinks)` =
case_when(
tsp_daily > 1 ~ 1,
TRUE ~ 0
))
df <- df %>%
mutate( `Area (1 = Rural)` =
case_when(
area == "Rural" ~ 1,
TRUE ~ 0
))
df <- df %>%
mutate( `Smoking (1 = Yes)` =
case_when(
area == "Rural" ~ 1,
TRUE ~ 0
))
df.for.model <- df %>%
select(`D1 status`:`Smoking (1 = Yes)`)
df.for.model <- janitor::clean_names(df.for.model)
names(df.for.model)
[1] "d1_status" "d3_status"
[3] "d5_status" "area"
[5] "visits_to_dentist_last_year_1_less_than_once_per_year" "visits_to_hygienist_last_year_1_less_than_once_per_year"
[7] "frequency_of_tooth_brushing_1_less_than_once_per_day" "family_affluence_according_to_the_fas_scale_1_low_family_affluence"
[9] "flossing_1_dental_floss_not_used" "mouthwash_1_mouthwash_not_used"
[11] "diet_1_high_in_fermentable_carbohydrates" "added_sugar_in_hot_drinks_1_extra_sugar_added_to_hot_drinks"
[13] "area_1_rural" "smoking_1_yes"
d1_model <-
glm(d1_status ~
visits_to_dentist_last_year_1_less_than_once_per_year +
visits_to_hygienist_last_year_1_less_than_once_per_year +
frequency_of_tooth_brushing_1_less_than_once_per_day +
family_affluence_according_to_the_fas_scale_1_low_family_affluence +
flossing_1_dental_floss_not_used +
mouthwash_1_mouthwash_not_used +
diet_1_high_in_fermentable_carbohydrates +
added_sugar_in_hot_drinks_1_extra_sugar_added_to_hot_drinks +
area_1_rural +
smoking_1_yes,
data = df.for.model,
family = binomial)
d3_model <-
glm(d3_status ~
visits_to_dentist_last_year_1_less_than_once_per_year +
visits_to_hygienist_last_year_1_less_than_once_per_year +
frequency_of_tooth_brushing_1_less_than_once_per_day +
family_affluence_according_to_the_fas_scale_1_low_family_affluence +
flossing_1_dental_floss_not_used +
mouthwash_1_mouthwash_not_used +
diet_1_high_in_fermentable_carbohydrates +
added_sugar_in_hot_drinks_1_extra_sugar_added_to_hot_drinks +
area_1_rural +
smoking_1_yes,,
data = df.for.model,
family = binomial)
d5_model <-
glm(d5_status ~
visits_to_dentist_last_year_1_less_than_once_per_year +
visits_to_hygienist_last_year_1_less_than_once_per_year +
frequency_of_tooth_brushing_1_less_than_once_per_day +
family_affluence_according_to_the_fas_scale_1_low_family_affluence +
flossing_1_dental_floss_not_used +
mouthwash_1_mouthwash_not_used +
diet_1_high_in_fermentable_carbohydrates +
added_sugar_in_hot_drinks_1_extra_sugar_added_to_hot_drinks +
area_1_rural +
smoking_1_yes,,
data = df.for.model,
family = binomial)
dep_var_labels=c("D1 enamel lesion = 1","D3 Cavitated enamel lesion = 1", "D5 dentin cavitated lesion = 1")
cov_labels = c("Frequency of dentist visits (1 = less than once per year)",
"Frequency of dental hygienist visits (1 = less than once per year)",
"Frequency of tooth-brushing (1 = less than once per day)",
"Family affluence according to the FAS scale (1 = low family affluence)",
"Flossing (1 = dental floss not used)",
"Fluoridated mouthwash (1 = mouthwash not used)",
"Diet (1 = high in fermentable carbohydrates)",
"Added sugar in hot drinks (1 = extra sugar added to hot drinks)",
"Area (1 = rural)",
"Smoking (1 = participant has been or is currently a smoker)")
stargazer::stargazer(d1_model, d3_model, d5_model, type = "text",
dep.var.labels = dep_var_labels,
covariate.labels = cov_labels,
no.space=FALSE,
apply.coef = exp,
# apply.se = exp,
ci=TRUE, ci.level=0.90, single.row=TRUE,
omit.table.layout = "n", star.cutoffs = NA) # omit p values FTW!
============================================================================================================================================================
Dependent variable:
-------------------------------------------------------------------------------------
D1 enamel lesion = 1 D3 Cavitated enamel lesion = 1 D5 dentin cavitated lesion = 1
(1) (2) (3)
------------------------------------------------------------------------------------------------------------------------------------------------------------
Frequency of dentist visits (1 = less than once per year) 1.443 (1.022, 1.863) 1.030 (0.854, 1.205) 1.137 (0.932, 1.343)
Frequency of dental hygienist visits (1 = less than once per year) 1.087 (0.725, 1.449) 1.323 (1.161, 1.486) 1.297 (1.107, 1.488)
Frequency of tooth-brushing (1 = less than once per day) 0.905 (0.578, 1.232) 1.040 (0.890, 1.190) 1.135 (0.955, 1.315)
Family affluence according to the FAS scale (1 = low family affluence) 1.102 (0.703, 1.501) 1.209 (1.029, 1.388) 1.657 (1.454, 1.860)
Flossing (1 = dental floss not used) 0.862 (0.509, 1.214) 1.101 (0.942, 1.260) 1.047 (0.854, 1.240)
Fluoridated mouthwash (1 = mouthwash not used) 1.042 (0.721, 1.362) 0.779 (0.632, 0.926) 0.707 (0.530, 0.884)
Diet (1 = high in fermentable carbohydrates) 0.823 (0.430, 1.216) 1.133 (0.962, 1.304) 1.145 (0.936, 1.353)
Added sugar in hot drinks (1 = extra sugar added to hot drinks) 0.997 (0.655, 1.338) 1.118 (0.962, 1.274) 0.979 (0.791, 1.167)
Area (1 = rural) 0.666 (0.313, 1.019) 1.286 (1.111, 1.460) 1.093 (0.886, 1.299)
Smoking (1 = participant has been or is currently a smoker)
Constant 22.887 (22.403, 23.372) 0.567 (0.356, 0.778) 0.215 (-0.043, 0.472)
------------------------------------------------------------------------------------------------------------------------------------------------------------
Observations 2,138 2,138 2,138
Log Likelihood -444.025 -1,448.976 -1,106.633
Akaike Inf. Crit. 908.049 2,917.951 2,233.266
============================================================================================================================================================
http://people.umass.edu/biep640w/pdf/R-for-Logistic-Regression.pdf
exp(cbind(OR = coef(d1_model), confint(d1_model)))
Waiting for profiling to be done...
OR 2.5 % 97.5 %
(Intercept) 22.8872535 13.1414912 41.780056
visits_to_dentist_last_year_1_less_than_once_per_year 1.4425234 0.8911760 2.436952
visits_to_hygienist_last_year_1_less_than_once_per_year 1.0868701 0.7118825 1.691850
frequency_of_tooth_brushing_1_less_than_once_per_day 0.9052952 0.6122190 1.336395
family_affluence_according_to_the_fas_scale_1_low_family_affluence 1.1018498 0.6961407 1.807529
flossing_1_dental_floss_not_used 0.8615879 0.5604467 1.300603
mouthwash_1_mouthwash_not_used 1.0416616 0.7100637 1.526820
diet_1_high_in_fermentable_carbohydrates 0.8230721 0.5049540 1.293786
added_sugar_in_hot_drinks_1_extra_sugar_added_to_hot_drinks 0.9965328 0.6573850 1.487291
area_1_rural 0.6659909 0.4411824 1.025628
smoking_1_yes NA NA NA
exp(cbind(OR = coef(d3_model), confint(d3_model)))
Waiting for profiling to be done...
OR 2.5 % 97.5 %
(Intercept) 0.5669106 0.4401196 0.7284624
visits_to_dentist_last_year_1_less_than_once_per_year 1.0296318 0.8352121 1.2683170
visits_to_hygienist_last_year_1_less_than_once_per_year 1.3232723 1.0906509 1.6056598
frequency_of_tooth_brushing_1_less_than_once_per_day 1.0401088 0.8699992 1.2433377
family_affluence_according_to_the_fas_scale_1_low_family_affluence 1.2085578 0.9755883 1.4966909
flossing_1_dental_floss_not_used 1.1010536 0.9115183 1.3306871
mouthwash_1_mouthwash_not_used 0.7791422 0.6539677 0.9278543
diet_1_high_in_fermentable_carbohydrates 1.1330486 0.9244956 1.3902428
added_sugar_in_hot_drinks_1_extra_sugar_added_to_hot_drinks 1.1178666 0.9284074 1.3467105
area_1_rural 1.2858698 1.0442670 1.5831942
smoking_1_yes NA NA NA
exp(cbind(OR = coef(d5_model), confint(d5_model)))
Waiting for profiling to be done...
OR 2.5 % 97.5 %
(Intercept) 0.2147169 0.1572149 0.2906101
visits_to_dentist_last_year_1_less_than_once_per_year 1.1371828 0.8883499 1.4497237
visits_to_hygienist_last_year_1_less_than_once_per_year 1.2972198 1.0324510 1.6268799
frequency_of_tooth_brushing_1_less_than_once_per_day 1.1348326 0.9158264 1.4066450
family_affluence_according_to_the_fas_scale_1_low_family_affluence 1.6571704 1.2987157 2.1080211
flossing_1_dental_floss_not_used 1.0468571 0.8331759 1.3193214
mouthwash_1_mouthwash_not_used 0.7069064 0.5724147 0.8720978
diet_1_high_in_fermentable_carbohydrates 1.1448231 0.8956698 1.4721138
added_sugar_in_hot_drinks_1_extra_sugar_added_to_hot_drinks 0.9788682 0.7834913 1.2262254
area_1_rural 1.0926469 0.8515552 1.3945433
smoking_1_yes NA NA NA
stargazer::stargazer(d1_model, d3_model, d5_model,
apply.coef = OR,
title = "Logistic Regression of Caries Risk Factors for 12-year-old Children in Latvia, 2016",
type = "text")
Logistic Regression of Caries Risk Factors for 12-year-old Children in Latvia, 2016
==================================================================================================
Dependent variable:
-------------------------------
d1_status d3_status d5_status
(1) (2) (3)
--------------------------------------------------------------------------------------------------
visits_to_dentist_last_year_1_less_than_once_per_year 1.443*** 1.030*** 1.137***
(0.255) (0.107) (0.125)
visits_to_hygienist_last_year_1_less_than_once_per_year 1.087*** 1.323*** 1.297***
(0.220) (0.099) (0.116)
frequency_of_tooth_brushing_1_less_than_once_per_day 0.905*** 1.040*** 1.135***
(0.199) (0.091) (0.109)
family_affluence_according_to_the_fas_scale_1_low_family_affluence 1.102*** 1.209*** 1.657***
(0.242) (0.109) (0.123)
flossing_1_dental_floss_not_used 0.862*** 1.101*** 1.047***
(0.214) (0.096) (0.117)
mouthwash_1_mouthwash_not_used 1.042*** 0.779*** 0.707***
(0.195) (0.089) (0.107)
diet_1_high_in_fermentable_carbohydrates 0.823*** 1.133*** 1.145***
(0.239) (0.104) (0.127)
added_sugar_in_hot_drinks_1_extra_sugar_added_to_hot_drinks 0.997*** 1.118*** 0.979***
(0.208) (0.095) (0.114)
area_1_rural 0.666*** 1.286*** 1.093***
(0.215) (0.106) (0.126)
smoking_1_yes
Constant 22.887*** 0.567*** 0.215
(0.295) (0.128) (0.157)
--------------------------------------------------------------------------------------------------
Observations 2,138 2,138 2,138
Log Likelihood -444.025 -1,448.976 -1,106.633
Akaike Inf. Crit. 908.049 2,917.951 2,233.266
==================================================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
# install.packages("ResourceSelection")
library(ResourceSelection)
ResourceSelection 0.3-2 2017-02-28
hoslem.test(d1_model$y, fitted(d1_model), g = 9)
Hosmer and Lemeshow goodness of fit (GOF) test
data: d1_model$y, fitted(d1_model)
X-squared = 5.8639, df = 7, p-value = 0.5557
pchisq(5.8639, df = 7, lower.tail = FALSE)
[1] 0.5557281
Evidence of a OVERALL GOODNESS OF FIT is reflected in a NON-SIGNIFICANT p-value. Here, the Hosmer-Lemeshow test p-value is 0.5557281, which suggest a good overall fit.
hoslem.test(d3_model$y, fitted(d3_model), g = 9)
Hosmer and Lemeshow goodness of fit (GOF) test
data: d3_model$y, fitted(d3_model)
X-squared = 3.5255, df = 7, p-value = 0.8325
pchisq(3.5255, df = 7, lower.tail = FALSE)
[1] 0.8325172
hoslem.test(d5_model$y, fitted(d5_model), g = 9)
Hosmer and Lemeshow goodness of fit (GOF) test
data: d5_model$y, fitted(d5_model)
X-squared = 4.0162, df = 7, p-value = 0.7779
pchisq(4.0162, df = 7, lower.tail = FALSE)
[1] 0.7779096