# LOAD LIBRARIES ----
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(DHARMa)
## This is DHARMa 0.4.7. For overview type '?DHARMa'. For recent changes, type news(package = 'DHARMa')
library(performance)
## Warning: package 'performance' was built under R version 4.5.2
library(nlme)
##
## Attaching package: 'nlme'
##
## The following object is masked from 'package:dplyr':
##
## collapse
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Attaching package: 'lme4'
##
## The following object is masked from 'package:nlme':
##
## lmList
library(mgcv)
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
library(gamlss)
## Loading required package: splines
## Loading required package: gamlss.data
##
## Attaching package: 'gamlss.data'
##
## The following object is masked from 'package:datasets':
##
## sleep
##
## Loading required package: gamlss.dist
## Loading required package: parallel
## ********** GAMLSS Version 5.5-0 **********
## For more on GAMLSS look at https://www.gamlss.com/
## Type gamlssNews() to see new features/changes/bug fixes.
##
##
## Attaching package: 'gamlss'
##
## The following object is masked from 'package:mgcv':
##
## lp
##
## The following object is masked from 'package:lme4':
##
## refit
##
## The following object is masked from 'package:DHARMa':
##
## getQuantile
library(quantreg)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
##
## The following object is masked from 'package:Matrix':
##
## det
library(corrplot)
## corrplot 0.95 loaded
# FIRST LOOK ----
# Read the raw CSV with base R, then convert the result to a tibble.
df <- as_tibble(read.csv("Autism.csv"))
dim(df) # number of rows, then number of columns
## [1] 704 21
names(df) # column names; note the source spellings (jundice, austim, contry_of_res) must be used verbatim
## [1] "A1_Score" "A2_Score" "A3_Score" "A4_Score"
## [5] "A5_Score" "A6_Score" "A7_Score" "A8_Score"
## [9] "A9_Score" "A10_Score" "age" "gender"
## [13] "ethnicity" "jundice" "austim" "contry_of_res"
## [17] "used_app_before" "result" "age_desc" "relation"
## [21] "Class.ASD"
head(df) # first 6 rows (the head() default), to eyeball the raw values
## # A tibble: 6 × 21
## A1_Score A2_Score A3_Score A4_Score A5_Score A6_Score A7_Score A8_Score
## <int> <int> <int> <int> <int> <int> <int> <int>
## 1 1 1 1 1 0 0 1 1
## 2 1 1 0 1 0 0 0 1
## 3 1 1 0 1 1 0 1 1
## 4 1 1 0 1 0 0 1 1
## 5 1 0 0 0 0 0 0 1
## 6 1 1 1 1 1 0 1 1
## # ℹ 13 more variables: A9_Score <int>, A10_Score <int>, age <chr>,
## # gender <chr>, ethnicity <chr>, jundice <chr>, austim <chr>,
## # contry_of_res <chr>, used_app_before <chr>, result <int>, age_desc <chr>,
## # relation <chr>, Class.ASD <chr>
str(df) # data types - VERY important: age comes in as chr here, so it has to be converted before use
## tibble [704 × 21] (S3: tbl_df/tbl/data.frame)
## $ A1_Score : int [1:704] 1 1 1 1 1 1 0 1 1 1 ...
## $ A2_Score : int [1:704] 1 1 1 1 0 1 1 1 1 1 ...
## $ A3_Score : int [1:704] 1 0 0 0 0 1 0 1 0 1 ...
## $ A4_Score : int [1:704] 1 1 1 1 0 1 0 1 0 1 ...
## $ A5_Score : int [1:704] 0 0 1 0 0 1 0 0 1 0 ...
## $ A6_Score : int [1:704] 0 0 0 0 0 0 0 0 0 1 ...
## $ A7_Score : int [1:704] 1 0 1 1 0 1 0 0 0 1 ...
## $ A8_Score : int [1:704] 1 1 1 1 1 1 1 0 1 1 ...
## $ A9_Score : int [1:704] 0 0 1 0 0 1 0 1 1 1 ...
## $ A10_Score : int [1:704] 0 1 1 1 0 1 0 0 1 0 ...
## $ age : chr [1:704] "26" "24" "27" "35" ...
## $ gender : chr [1:704] "f" "m" "m" "f" ...
## $ ethnicity : chr [1:704] "White-European" "Latino" "Latino" "White-European" ...
## $ jundice : chr [1:704] "no" "no" "yes" "no" ...
## $ austim : chr [1:704] "no" "yes" "yes" "yes" ...
## $ contry_of_res : chr [1:704] "'United States'" "Brazil" "Spain" "'United States'" ...
## $ used_app_before: chr [1:704] "no" "no" "no" "no" ...
## $ result : int [1:704] 6 5 8 6 2 9 2 5 6 8 ...
## $ age_desc : chr [1:704] "'18 and more'" "'18 and more'" "'18 and more'" "'18 and more'" ...
## $ relation : chr [1:704] "Self" "Self" "Parent" "Self" ...
## $ Class.ASD : chr [1:704] "NO" "NO" "YES" "NO" ...
# CLEAN DATA ----
# Recode the "?" placeholder to NA. Only character columns can hold "?",
# so restricting the recode to them leaves the integer columns untouched.
df <- df %>%
  mutate(across(where(is.character), \(col) na_if(col, "?")))
colSums(is.na(df)) # number of NAs in each column
## A1_Score A2_Score A3_Score A4_Score A5_Score
## 0 0 0 0 0
## A6_Score A7_Score A8_Score A9_Score A10_Score
## 0 0 0 0 0
## age gender ethnicity jundice austim
## 2 0 95 0 0
## contry_of_res used_app_before result age_desc relation
## 0 0 0 0 95
## Class.ASD
## 0
100 * colMeans(is.na(df)) # share of missing values per column, in percent
## A1_Score A2_Score A3_Score A4_Score A5_Score
## 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## A6_Score A7_Score A8_Score A9_Score A10_Score
## 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## age gender ethnicity jundice austim
## 0.2840909 0.0000000 13.4943182 0.0000000 0.0000000
## contry_of_res used_app_before result age_desc relation
## 0.0000000 0.0000000 0.0000000 0.0000000 13.4943182
## Class.ASD
## 0.0000000
# Complete-case filter: keep only rows with no NA in any column
# (704 -> 609 rows in the recorded run).
df <- drop_na(df)

# Re-type the columns for modelling. Existing columns are converted in place;
# `asd` is new, so mutate() appends it as the last column.
df <- df %>%
  mutate(
    # Numeric 0/1 copy of the outcome, taken from the raw "YES"/"NO" labels
    # before Class.ASD itself is turned into a factor below.
    asd = ifelse(Class.ASD == "YES", 1, 0),
    across(A1_Score:A10_Score, as.integer),
    across(c(age, result), as.numeric),
    across(
      c(gender, ethnicity, jundice, austim, used_app_before, relation),
      factor
    ),
    # Explicit levels fix the order as NO, then YES.
    Class.ASD = factor(Class.ASD, levels = c("NO", "YES"))
  )
str(df) # re-inspect the structure to confirm the row count and converted column types
## tibble [609 × 22] (S3: tbl_df/tbl/data.frame)
## $ A1_Score : int [1:609] 1 1 1 1 1 0 1 1 1 1 ...
## $ A2_Score : int [1:609] 1 1 1 1 1 1 1 1 1 1 ...
## $ A3_Score : int [1:609] 1 0 0 0 1 0 1 0 1 1 ...
## $ A4_Score : int [1:609] 1 1 1 1 1 0 1 0 1 1 ...
## $ A5_Score : int [1:609] 0 0 1 0 1 0 0 1 0 1 ...
## $ A6_Score : int [1:609] 0 0 0 0 0 0 0 0 1 1 ...
## $ A7_Score : int [1:609] 1 0 1 1 1 0 0 0 1 1 ...
## $ A8_Score : int [1:609] 1 1 1 1 1 1 0 1 1 1 ...
## $ A9_Score : int [1:609] 0 0 1 0 1 0 1 1 1 1 ...
## $ A10_Score : int [1:609] 0 1 1 1 1 0 0 1 0 1 ...
## $ age : num [1:609] 26 24 27 35 36 17 64 29 17 33 ...
## $ gender : Factor w/ 2 levels "f","m": 1 2 2 1 2 1 2 2 2 2 ...
## $ ethnicity : Factor w/ 11 levels "'Middle Eastern '",..: 11 6 6 11 8 4 11 11 3 11 ...
## $ jundice : Factor w/ 2 levels "no","yes": 1 1 2 1 2 1 1 1 2 1 ...
## $ austim : Factor w/ 2 levels "no","yes": 1 2 2 2 1 1 1 1 2 1 ...
## $ contry_of_res : chr [1:609] "'United States'" "Brazil" "Spain" "'United States'" ...
## $ used_app_before: Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ result : num [1:609] 6 5 8 6 9 2 5 6 8 10 ...
## $ age_desc : chr [1:609] "'18 and more'" "'18 and more'" "'18 and more'" "'18 and more'" ...
## $ relation : Factor w/ 5 levels "'Health care professional'",..: 5 5 3 5 5 5 3 5 1 4 ...
## $ Class.ASD : Factor w/ 2 levels "NO","YES": 1 1 2 1 2 1 1 1 2 2 ...
## $ asd : num [1:609] 0 0 1 0 1 0 0 0 1 1 ...